diff --git a/test/Project.toml b/test/Project.toml index a44cae6b..ad73c325 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,6 +8,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" @@ -23,4 +24,8 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" [compat] -pocl_jll = "7.0" +pocl_jll = "~7.0" +ParallelTestRunner = "1.0.1" + +[sources] +ParallelTestRunner = {url="https://github.com/JuliaTesting/ParallelTestRunner.jl", rev="tb/testsuite"} diff --git a/test/array.jl b/test/array.jl index afe97e23..53fcd9b5 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,107 +1,105 @@ using LinearAlgebra import Adapt -@testset "CLArray" begin - @testset "constructors" begin - xs = CLArray{Int, 2, cl.Buffer}(undef, 2, 3) - @test collect(CLArray([1 2; 3 4])) == [1 2; 3 4] - @test testf(vec, rand(Float32, 5, 3)) - @test Base.elsize(xs) == sizeof(Int) - @test CLArray{Int, 2}(xs) === xs - - @test device_accessible(xs) - @test !host_accessible(xs) - @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) - @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) - - @test collect(OpenCL.zeros(Float32, 2, 2)) == zeros(Float32, 2, 2) - @test collect(OpenCL.ones(Float32, 2, 2)) == ones(Float32, 2, 2) - - @test collect(OpenCL.fill(0, 2, 2)) == zeros(Int, 2, 2) - @test collect(OpenCL.fill(1, 2, 2)) == ones(Int, 2, 2) - end +@testset "constructors" begin + xs = CLArray{Int, 2, cl.Buffer}(undef, 2, 3) + @test collect(CLArray([1 2; 3 4])) == [1 2; 3 4] + @test testf(vec, rand(Float32, 5, 3)) + @test Base.elsize(xs) == sizeof(Int) + @test CLArray{Int, 
2}(xs) === xs + + @test device_accessible(xs) + @test !host_accessible(xs) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) + + @test collect(OpenCL.zeros(Float32, 2, 2)) == zeros(Float32, 2, 2) + @test collect(OpenCL.ones(Float32, 2, 2)) == ones(Float32, 2, 2) + + @test collect(OpenCL.fill(0, 2, 2)) == zeros(Int, 2, 2) + @test collect(OpenCL.fill(1, 2, 2)) == ones(Int, 2, 2) +end - @testset "adapt" begin - A = rand(Float32, 3, 3) - dA = CLArray(A) - @test Adapt.adapt(Array, dA) == A - @test Adapt.adapt(CLArray, A) isa CLArray - @test Array(Adapt.adapt(CLArray, A)) == A - end +@testset "adapt" begin + A = rand(Float32, 3, 3) + dA = CLArray(A) + @test Adapt.adapt(Array, dA) == A + @test Adapt.adapt(CLArray, A) isa CLArray + @test Array(Adapt.adapt(CLArray, A)) == A +end - @testset "reshape" begin - A = [ - 1 2 3 4 - 5 6 7 8 - ] - gA = reshape(CLArray(A), 1, 8) - _A = reshape(A, 1, 8) - _gA = Array(gA) - @test all(_A .== _gA) - A = [1, 2, 3, 4] - gA = reshape(CLArray(A), 4) - end +@testset "reshape" begin + A = [ + 1 2 3 4 + 5 6 7 8 + ] + gA = reshape(CLArray(A), 1, 8) + _A = reshape(A, 1, 8) + _gA = Array(gA) + @test all(_A .== _gA) + A = [1, 2, 3, 4] + gA = reshape(CLArray(A), 4) +end - @testset "fill(::SubArray)" begin - xs = OpenCL.zeros(Float32, 3) - fill!(view(xs, 2:2), 1) - @test Array(xs) == [0, 1, 0] +@testset "fill(::SubArray)" begin + xs = OpenCL.zeros(Float32, 3) + fill!(view(xs, 2:2), 1) + @test Array(xs) == [0, 1, 0] +end +# TODO: Look into how to port the @sync + +if cl.memory_backend() isa cl.USMBackend + @testset "shared buffers & unsafe_wrap" begin + a = CLVector{Int, cl.UnifiedSharedMemory}(undef, 2) + + # check that basic operations work on arrays backed by shared memory + fill!(a, 40) + a .+= 2 + @test Array(a) == [42, 42] + + # derive an Array object and test that the memory keeps in sync + b = unsafe_wrap(Array, a) + b[1] = 100 + @test Array(a) == [100, 42] + 
copyto!(a, 2, [200], 1, 1) + cl.finish(cl.queue()) + @test b == [100, 200] end - # TODO: Look into how to port the @sync - - if cl.memory_backend() isa cl.USMBackend - @testset "shared buffers & unsafe_wrap" begin - a = CLVector{Int, cl.UnifiedSharedMemory}(undef, 2) - - # check that basic operations work on arrays backed by shared memory - fill!(a, 40) - a .+= 2 - @test Array(a) == [42, 42] - - # derive an Array object and test that the memory keeps in sync - b = unsafe_wrap(Array, a) - b[1] = 100 - @test Array(a) == [100, 42] - copyto!(a, 2, [200], 1, 1) - cl.finish(cl.queue()) - @test b == [100, 200] - end - - # https://github.com/JuliaGPU/CUDA.jl/issues/2191 - @testset "preserving memory types" begin - a = CLVector{Int, cl.UnifiedSharedMemory}([1]) - @test OpenCL.memtype(a) == cl.UnifiedSharedMemory - - # unified-ness should be preserved - b = a .+ 1 - @test OpenCL.memtype(b) == cl.UnifiedSharedMemory - - # when there's a conflict, we should defer to unified memory - c = CLVector{Int, cl.UnifiedSharedMemory}([1]) - d = CLVector{Int, cl.UnifiedDeviceMemory}([1]) - e = c .+ d - @test OpenCL.memtype(e) == cl.UnifiedSharedMemory - end + + # https://github.com/JuliaGPU/CUDA.jl/issues/2191 + @testset "preserving memory types" begin + a = CLVector{Int, cl.UnifiedSharedMemory}([1]) + @test OpenCL.memtype(a) == cl.UnifiedSharedMemory + + # unified-ness should be preserved + b = a .+ 1 + @test OpenCL.memtype(b) == cl.UnifiedSharedMemory + + # when there's a conflict, we should defer to unified memory + c = CLVector{Int, cl.UnifiedSharedMemory}([1]) + d = CLVector{Int, cl.UnifiedDeviceMemory}([1]) + e = c .+ d + @test OpenCL.memtype(e) == cl.UnifiedSharedMemory end +end - @testset "resizing" begin - a = CLArray([1, 2, 3]) +@testset "resizing" begin + a = CLArray([1, 2, 3]) - resize!(a, 3) - @test length(a) == 3 - @test Array(a) == [1, 2, 3] + resize!(a, 3) + @test length(a) == 3 + @test Array(a) == [1, 2, 3] - resize!(a, 5) - @test length(a) == 5 - @test Array(a)[1:3] == 
[1, 2, 3] + resize!(a, 5) + @test length(a) == 5 + @test Array(a)[1:3] == [1, 2, 3] - resize!(a, 2) - @test length(a) == 2 - @test Array(a)[1:2] == [1, 2] + resize!(a, 2) + @test length(a) == 2 + @test Array(a)[1:2] == [1, 2] - b = CLArray{Int}(undef, 0) - @test length(b) == 0 - resize!(b, 1) - @test length(b) == 1 - end + b = CLArray{Int}(undef, 0) + @test length(b) == 0 + resize!(b, 1) + @test length(b) == 1 end diff --git a/test/atomics.jl b/test/atomics.jl index ce69ca30..e29f0b29 100644 --- a/test/atomics.jl +++ b/test/atomics.jl @@ -1,7 +1,5 @@ using SPIRVIntrinsics: @builtin_ccall, @typed_ccall, LLVMPtr -@testset "atomics" begin - function atomic_count(counter) OpenCL.@atomic counter[] += 1 return @@ -15,27 +13,25 @@ end end end - if "cl_ext_float_atomics" in cl.device().extensions - function atomic_float_add(counter, val) - @builtin_ccall( - "atomic_add", Float32, - (LLVMPtr{Float32, AS.CrossWorkgroup}, Float32), - pointer(counter), val, - ) - return - end +if "cl_ext_float_atomics" in cl.device().extensions + function atomic_float_add(counter, val) + @builtin_ccall( + "atomic_add", Float32, + (LLVMPtr{Float32, AS.CrossWorkgroup}, Float32), + pointer(counter), val, + ) + return + end - @testset "SPV_EXT_shader_atomic_float_add extension" begin - a = OpenCL.zeros(Float32) - @opencl global_size = 1000 extensions = ["SPV_EXT_shader_atomic_float_add"] atomic_float_add(a, 1.0f0) - @test OpenCL.@allowscalar a[] == 1000.0f0 + @testset "SPV_EXT_shader_atomic_float_add extension" begin + a = OpenCL.zeros(Float32) + @opencl global_size = 1000 extensions = ["SPV_EXT_shader_atomic_float_add"] atomic_float_add(a, 1.0f0) + @test OpenCL.@allowscalar a[] == 1000.0f0 - spv = sprint() do io - OpenCL.code_native(io, atomic_float_add, Tuple{CLDeviceArray{Float32, 0, 1}, Float32}; extensions = ["SPV_EXT_shader_atomic_float_add"]) - end - @test occursin("OpExtension \"SPV_EXT_shader_atomic_float_add\"", spv) - @test occursin("OpAtomicFAddEXT", spv) + spv = sprint() do io + 
OpenCL.code_native(io, atomic_float_add, Tuple{CLDeviceArray{Float32, 0, 1}, Float32}; extensions = ["SPV_EXT_shader_atomic_float_add"]) end + @test occursin("OpExtension \"SPV_EXT_shader_atomic_float_add\"", spv) + @test occursin("OpAtomicFAddEXT", spv) end - end diff --git a/test/cmdqueue.jl b/test/cmdqueue.jl index f31ed9b4..f2ff64b1 100644 --- a/test/cmdqueue.jl +++ b/test/cmdqueue.jl @@ -1,27 +1,25 @@ -@testset "CmdQueue" begin - @testset "constructor" begin - @test cl.CmdQueue() != nothing - @test cl.CmdQueue(:profile) != nothing - try - cl.CmdQueue(:out_of_order) - cl.CmdQueue((:profile, :out_of_order)) - catch err - @warn("Platform $(cl.device().platform.name) does not seem to " * - "suport out of order queues: \n$err",maxlog=1, - exception=(err, catch_backtrace())) - end - @test_throws ArgumentError cl.CmdQueue(:unrecognized_flag) - for flag in [:profile, :out_of_order] - @test_throws ArgumentError cl.CmdQueue((flag, :unrecognized_flag)) - @test_throws ArgumentError cl.CmdQueue((flag, flag)) - end +@testset "constructor" begin + @test cl.CmdQueue() != nothing + @test cl.CmdQueue(:profile) != nothing + try + cl.CmdQueue(:out_of_order) + cl.CmdQueue((:profile, :out_of_order)) + catch err + @warn("Platform $(cl.device().platform.name) does not seem to " * + "suport out of order queues: \n$err",maxlog=1, + exception=(err, catch_backtrace())) end - - @testset "info" begin - q = cl.CmdQueue() - @test q.context == cl.context() - @test q.device == cl.device() - @test q.reference_count > 0 - @test typeof(q.properties) == cl.cl_command_queue_properties + @test_throws ArgumentError cl.CmdQueue(:unrecognized_flag) + for flag in [:profile, :out_of_order] + @test_throws ArgumentError cl.CmdQueue((flag, :unrecognized_flag)) + @test_throws ArgumentError cl.CmdQueue((flag, flag)) end end + +@testset "info" begin + q = cl.CmdQueue() + @test q.context == cl.context() + @test q.device == cl.device() + @test q.reference_count > 0 + @test typeof(q.properties) == 
cl.cl_command_queue_properties +end diff --git a/test/context.jl b/test/context.jl index bc5529cc..8d6bc620 100644 --- a/test/context.jl +++ b/test/context.jl @@ -1,87 +1,85 @@ -@testset "Context" begin - @testset "constructor" begin - @test_throws MethodError (cl.Context([])) +@testset "constructor" begin + @test_throws MethodError (cl.Context([])) - ctx = cl.Context(cl.device()) - @test ctx != nothing - @test ctx.reference_count == 1 - ctx_id = pointer(ctx) + ctx = cl.Context(cl.device()) + @test ctx != nothing + @test ctx.reference_count == 1 + ctx_id = pointer(ctx) - ctx2 = cl.Context(ctx_id; retain=true) - @test ctx.reference_count == 2 - finalize(ctx2) - @test ctx.reference_count == 1 + ctx2 = cl.Context(ctx_id; retain=true) + @test ctx.reference_count == 2 + finalize(ctx2) + @test ctx.reference_count == 1 - # TODO: support switching contexts - #@testset "Context callback" begin - # function context_test_callback(arg1, arg2, arg3) - # # We're not really testing it because, nvidia doesn't seem to care about this functionality: - # # https://devtalk.nvidia.com/default/topic/497433/context-callback-never-called/ - # OpenCL.cl.log_error("Callback works") - # return - # end - # - # function create_context_error() - # empty_kernel = " - # __kernel void test() { - # int c = 1 + 1; - # };" - # try - # p = cl.Program(source = empty_kernel) |> cl.build! 
- # k = cl.Kernel(p, "test") - # cl.call(k; global_size=1, local_size=10000000) - # catch - # end - # end - # - # ctx = cl.Context(cl.device(), callback = context_test_callback) - # context!(ctx) do - # create_context_error() - # end - #end - end + # TODO: support switching contexts + #@testset "Context callback" begin + # function context_test_callback(arg1, arg2, arg3) + # # We're not really testing it because, nvidia doesn't seem to care about this functionality: + # # https://devtalk.nvidia.com/default/topic/497433/context-callback-never-called/ + # OpenCL.cl.log_error("Callback works") + # return + # end + # + # function create_context_error() + # empty_kernel = " + # __kernel void test() { + # int c = 1 + 1; + # };" + # try + # p = cl.Program(source = empty_kernel) |> cl.build! + # k = cl.Kernel(p, "test") + # cl.call(k; global_size=1, local_size=10000000) + # catch + # end + # end + # + # ctx = cl.Context(cl.device(), callback = context_test_callback) + # context!(ctx) do + # create_context_error() + # end + #end +end - @testset "platform properties" begin - try - cl.Context(cl.CL_DEVICE_TYPE_CPU) - catch err - @test typeof(err) == cl.CLError - # CL_DEVICE_NOT_FOUND could be throw for GPU only drivers - @test err.desc in (:CL_INVALID_PLATFORM, - :CL_DEVICE_NOT_FOUND) - end +@testset "platform properties" begin + try + cl.Context(cl.CL_DEVICE_TYPE_CPU) + catch err + @test typeof(err) == cl.CLError + # CL_DEVICE_NOT_FOUND could be throw for GPU only drivers + @test err.desc in (:CL_INVALID_PLATFORM, + :CL_DEVICE_NOT_FOUND) + end - properties = [(cl.CL_CONTEXT_PLATFORM, cl.platform())] - for (cl_dev_type, sym_dev_type) in [(cl.CL_DEVICE_TYPE_CPU, :cpu), - (cl.CL_DEVICE_TYPE_GPU, :gpu)] - if !cl.has_device_type(cl.platform(), sym_dev_type) - continue - end - @test cl.Context(sym_dev_type; properties) != nothing - @test cl.Context(cl_dev_type; properties) != nothing - ctx = cl.Context(cl_dev_type; properties) - @test !isempty(ctx.properties) - test_properties = 
ctx.properties + properties = [(cl.CL_CONTEXT_PLATFORM, cl.platform())] + for (cl_dev_type, sym_dev_type) in [(cl.CL_DEVICE_TYPE_CPU, :cpu), + (cl.CL_DEVICE_TYPE_GPU, :gpu)] + if !cl.has_device_type(cl.platform(), sym_dev_type) + continue + end + @test cl.Context(sym_dev_type; properties) != nothing + @test cl.Context(cl_dev_type; properties) != nothing + ctx = cl.Context(cl_dev_type; properties) + @test !isempty(ctx.properties) + test_properties = ctx.properties - @test test_properties == properties + @test test_properties == properties - platform_in_properties = false - for (t, v) in test_properties - if t == cl.CL_CONTEXT_PLATFORM - @test v.name == cl.platform().name - @test v == cl.platform() - platform_in_properties = true - break - end + platform_in_properties = false + for (t, v) in test_properties + if t == cl.CL_CONTEXT_PLATFORM + @test v.name == cl.platform().name + @test v == cl.platform() + platform_in_properties = true + break end - @test platform_in_properties - end - try - ctx2 = cl.Context(cl.CL_DEVICE_TYPE_ACCELERATOR; properties) - catch err - @test typeof(err) == cl.CLError - @test err.desc == :CL_DEVICE_NOT_FOUND end + @test platform_in_properties + end + try + ctx2 = cl.Context(cl.CL_DEVICE_TYPE_ACCELERATOR; properties) + catch err + @test typeof(err) == cl.CLError + @test err.desc == :CL_DEVICE_NOT_FOUND end end diff --git a/test/device.jl b/test/device.jl index 94441053..ff726a57 100644 --- a/test/device.jl +++ b/test/device.jl @@ -1,91 +1,89 @@ -@testset "Device" begin - @testset "Type" begin - for (t, k) in zip((cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_CPU, - cl.CL_DEVICE_TYPE_ACCELERATOR, cl.CL_DEVICE_TYPE_ALL), - (:gpu, :cpu, :accelerator, :all)) +@testset "Type" begin + for (t, k) in zip((cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_CPU, + cl.CL_DEVICE_TYPE_ACCELERATOR, cl.CL_DEVICE_TYPE_ALL), + (:gpu, :cpu, :accelerator, :all)) - #for (dk, dt) in zip(cl.devices(cl.platform(), k), cl.devices(cl.platform(), t)) - # @fact dk == dt --> true - 
#end - #devices = cl.devices(cl.platform(), k) - #for device in devices - # @fact device.device_type == t --> true - #end - end + #for (dk, dt) in zip(cl.devices(cl.platform(), k), cl.devices(cl.platform(), t)) + # @fact dk == dt --> true + #end + #devices = cl.devices(cl.platform(), k) + #for device in devices + # @fact device.device_type == t --> true + #end end +end - @testset "Equality" begin - devices = cl.devices(cl.platform()) +@testset "Equality" begin + devices = cl.devices(cl.platform()) - if length(devices) > 1 - d1 = devices[1] - for d2 in devices[2:end] - @test pointer(d2) != pointer(d1) - @test hash(d2) != hash(d1) - @test isequal(d2, d1) == false - end + if length(devices) > 1 + d1 = devices[1] + for d2 in devices[2:end] + @test pointer(d2) != pointer(d1) + @test hash(d2) != hash(d1) + @test isequal(d2, d1) == false end - end + end +end - @testset "Info" begin - device_info_keys = Symbol[ - :driver_version, - :version, - :extensions, - :platform, - :name, - :device_type, - :has_image_support, - :vendor_id, - :max_compute_units, - :max_work_item_size, - :max_clock_frequency, - :address_bits, - :max_read_image_args, - :max_write_image_args, - :global_mem_size, - :max_mem_alloc_size, - :max_const_buffer_size, - :local_mem_size, - :has_local_mem, - :host_unified_memory, - :available, - :compiler_available, - :max_work_group_size, - :max_parameter_size, - :profiling_timer_resolution, - :max_image2d_shape, - :max_image3d_shape, - ] - @test isa(cl.platform(), cl.Platform) - @test_throws ErrorException cl.platform().zjdlkf +@testset "Info" begin + device_info_keys = Symbol[ + :driver_version, + :version, + :extensions, + :platform, + :name, + :device_type, + :has_image_support, + :vendor_id, + :max_compute_units, + :max_work_item_size, + :max_clock_frequency, + :address_bits, + :max_read_image_args, + :max_write_image_args, + :global_mem_size, + :max_mem_alloc_size, + :max_const_buffer_size, + :local_mem_size, + :has_local_mem, + :host_unified_memory, + 
:available, + :compiler_available, + :max_work_group_size, + :max_parameter_size, + :profiling_timer_resolution, + :max_image2d_shape, + :max_image3d_shape, + ] + @test isa(cl.platform(), cl.Platform) + @test_throws ErrorException cl.platform().zjdlkf - device = cl.device() - @test isa(device, cl.Device) - @test_throws ErrorException device.zjdlkf - for k in device_info_keys - v = getproperty(device, k) - if k == :extensions - @test isa(v, Array) - if length(v) > 0 - @test isa(v, Array{String, 1}) - end - elseif k == :platform - @test v == cl.platform() - elseif k == :max_work_item_sizes - @test length(v) == 3 - elseif k == :max_image2d_shape - @test length(v) == 2 - elseif k == :max_image3d_shape - @test length(v) == 3 + device = cl.device() + @test isa(device, cl.Device) + @test_throws ErrorException device.zjdlkf + for k in device_info_keys + v = getproperty(device, k) + if k == :extensions + @test isa(v, Array) + if length(v) > 0 + @test isa(v, Array{String, 1}) end + elseif k == :platform + @test v == cl.platform() + elseif k == :max_work_item_sizes + @test length(v) == 3 + elseif k == :max_image2d_shape + @test length(v) == 2 + elseif k == :max_image3d_shape + @test length(v) == 3 end + end - @test cl.queue_properties(cl.device()).profiling isa Bool - @test cl.queue_properties(cl.device()).out_of_order_exec isa Bool + @test cl.queue_properties(cl.device()).profiling isa Bool + @test cl.queue_properties(cl.device()).out_of_order_exec isa Bool - @test cl.exec_capabilities(cl.device()).native_kernel isa Bool + @test cl.exec_capabilities(cl.device()).native_kernel isa Bool - @test cl.svm_capabilities(cl.device()).fine_grain_buffer isa Bool - end + @test cl.svm_capabilities(cl.device()).fine_grain_buffer isa Bool end diff --git a/test/event.jl b/test/event.jl index d3c437cb..55796fac 100644 --- a/test/event.jl +++ b/test/event.jl @@ -3,68 +3,68 @@ if contains(cl.platform().vendor, "Intel") || contains(cl.platform().vendor, "po # hangs on Intel @warn "Skipping 
event tests on $(cl.platform().name)" else -@testset "Event" begin - @testset "status" begin - evt = cl.UserEvent() - evt.status - @test evt.status == :submitted - cl.complete(evt) - @test evt.status == :complete - finalize(evt) - end - @testset "wait" begin - # create user event - usr_evt = cl.UserEvent() - cl.enqueue_wait_for_events(usr_evt) +@testset "status" begin + evt = cl.UserEvent() + evt.status + @test evt.status == :submitted + cl.complete(evt) + @test evt.status == :complete + finalize(evt) +end + +@testset "wait" begin + # create user event + usr_evt = cl.UserEvent() + cl.enqueue_wait_for_events(usr_evt) - # create marker event - mkr_evt = cl.enqueue_marker() + # create marker event + mkr_evt = cl.enqueue_marker() - @test usr_evt.status == :submitted - @test mkr_evt.status in (:queued, :submitted) + @test usr_evt.status == :submitted + @test mkr_evt.status in (:queued, :submitted) - cl.complete(usr_evt) - @test usr_evt.status == :complete + cl.complete(usr_evt) + @test usr_evt.status == :complete - wait(mkr_evt) - @test mkr_evt.status == :complete + wait(mkr_evt) + @test mkr_evt.status == :complete - @test cl.cl_event_status(:running) == cl.CL_RUNNING - @test cl.cl_event_status(:submitted) == cl.CL_SUBMITTED - @test cl.cl_event_status(:queued) == cl.CL_QUEUED - @test cl.cl_event_status(:complete) == cl.CL_COMPLETE - end + @test cl.cl_event_status(:running) == cl.CL_RUNNING + @test cl.cl_event_status(:submitted) == cl.CL_SUBMITTED + @test cl.cl_event_status(:queued) == cl.CL_QUEUED + @test cl.cl_event_status(:complete) == cl.CL_COMPLETE +end - @testset "callback" begin - global callback_called = Ref(false) +@testset "callback" begin + global callback_called = Ref(false) - function test_callback(evt, status) - callback_called[] = true - end + function test_callback(evt, status) + callback_called[] = true + end - usr_evt = cl.UserEvent() + usr_evt = cl.UserEvent() - cl.enqueue_wait_for_events(usr_evt) + cl.enqueue_wait_for_events(usr_evt) - mkr_evt = 
cl.enqueue_marker() - cl.add_callback(mkr_evt, test_callback) + mkr_evt = cl.enqueue_marker() + cl.add_callback(mkr_evt, test_callback) - @test usr_evt.status == :submitted - @test mkr_evt.status in (:queued, :submitted) - @test !callback_called[] + @test usr_evt.status == :submitted + @test mkr_evt.status in (:queued, :submitted) + @test !callback_called[] - cl.complete(usr_evt) - @test usr_evt.status == :complete + cl.complete(usr_evt) + @test usr_evt.status == :complete - wait(mkr_evt) + wait(mkr_evt) - # Give callback some time to finish - yield() - sleep(0.5) + # Give callback some time to finish + yield() + sleep(0.5) - @test mkr_evt.status == :complete - @test callback_called[] - end + @test mkr_evt.status == :complete + @test callback_called[] end + end diff --git a/test/execution.jl b/test/execution.jl index e15a4349..1fd69d7a 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1,6 +1,5 @@ using SPIRV_LLVM_Translator_jll - -@testset "execution" begin +using IOCapture @testset "@opencl" begin @@ -150,5 +149,3 @@ end @test occursin("target triple = \"spir64-unknown-unknown\"", llvm_backend_khronos) end end - -end diff --git a/test/intrinsics.jl b/test/intrinsics.jl index ad970c05..2f408fcb 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -12,8 +12,6 @@ end const float_types = filter(x -> x <: Base.IEEEFloat, GPUArraysTestSuite.supported_eltypes(CLArray)) const ispocl = cl.platform().name == "Portable Computing Language" -@testset "intrinsics" begin - @testset "barrier" begin @on_device barrier(OpenCL.LOCAL_MEM_FENCE) @@ -163,5 +161,3 @@ end end end - -end diff --git a/test/kernel.jl b/test/kernel.jl index e7033c8b..a938305b 100644 --- a/test/kernel.jl +++ b/test/kernel.jl @@ -1,163 +1,161 @@ -@testset "Kernel" begin - test_source = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c, - const unsigned int count) - { - unsigned int gid = get_global_id(0); - if (gid < count) { - c[gid] = a[gid] + 
b[gid]; - } - } - " - - #TODO: tests for invalid kernel build error && logs... - - @testset "constructor" begin - prg = cl.Program(source=test_source) - @test_throws ArgumentError cl.Kernel(prg, "sum") - cl.build!(prg) - @test cl.Kernel(prg, "sum") != nothing - end +test_source = " +__kernel void sum(__global const float *a, + __global const float *b, + __global float *c, + const unsigned int count) +{ + unsigned int gid = get_global_id(0); + if (gid < count) { + c[gid] = a[gid] + b[gid]; + } +} +" + +#TODO: tests for invalid kernel build error && logs... + +@testset "constructor" begin + prg = cl.Program(source=test_source) + @test_throws ArgumentError cl.Kernel(prg, "sum") + cl.build!(prg) + @test cl.Kernel(prg, "sum") != nothing +end - @testset "info" begin - prg = cl.Program(source=test_source) - cl.build!(prg) - k = cl.Kernel(prg, "sum") - @test k.function_name == "sum" - @test k.num_args == 4 - @test k.reference_count > 0 - @test k.program == prg - @test typeof(k.attributes) == String - end +@testset "info" begin + prg = cl.Program(source=test_source) + cl.build!(prg) + k = cl.Kernel(prg, "sum") + @test k.function_name == "sum" + @test k.num_args == 4 + @test k.reference_count > 0 + @test k.program == prg + @test typeof(k.attributes) == String +end - @testset "mem/workgroup size" begin - prg = cl.Program(source=test_source) - cl.build!(prg) - k = cl.Kernel(prg, "sum") - wginfo = cl.work_group_info(k, cl.device()) - for sf in [:size, :compile_size, :local_mem_size, :private_mem_size, :prefered_size_multiple] - @test getproperty(wginfo, sf) != nothing - end +@testset "mem/workgroup size" begin + prg = cl.Program(source=test_source) + cl.build!(prg) + k = cl.Kernel(prg, "sum") + wginfo = cl.work_group_info(k, cl.device()) + for sf in [:size, :compile_size, :local_mem_size, :private_mem_size, :prefered_size_multiple] + @test getproperty(wginfo, sf) != nothing end +end - @testset "set_arg!/set_args!" begin - prg = cl.Program(source=test_source) |> cl.build! 
- k = cl.Kernel(prg, "sum") +@testset "set_arg!/set_args!" begin + prg = cl.Program(source=test_source) |> cl.build! + k = cl.Kernel(prg, "sum") - count = 1024 - nbytes = count * sizeof(Float32) + count = 1024 + nbytes = count * sizeof(Float32) - h_ones = ones(Float32, count) + h_ones = ones(Float32, count) - A = CLArray(h_ones) - B = CLArray(h_ones) - C = CLArray{Float32}(undef, count) + A = CLArray(h_ones) + B = CLArray(h_ones) + C = CLArray{Float32}(undef, count) - # we use julia's index by one convention - cl.set_arg!(k, 1, A.data[].mem) - cl.set_arg!(k, 2, B.data[].mem) - cl.set_arg!(k, 3, C.data[].mem) - cl.set_arg!(k, 4, UInt32(count)) + # we use julia's index by one convention + cl.set_arg!(k, 1, A.data[].mem) + cl.set_arg!(k, 2, B.data[].mem) + cl.set_arg!(k, 3, C.data[].mem) + cl.set_arg!(k, 4, UInt32(count)) - cl.enqueue_kernel(k, count) |> wait - r = Array(C) + cl.enqueue_kernel(k, count) |> wait + r = Array(C) - @test all(x -> x == 2.0, r) - cl.flush(cl.queue()) + @test all(x -> x == 2.0, r) + cl.flush(cl.queue()) - # test set_args with new kernel - k2 = cl.Kernel(prg, "sum") - cl.set_args!(k2, A.data[].mem, B.data[].mem, C.data[].mem, UInt32(count)) + # test set_args with new kernel + k2 = cl.Kernel(prg, "sum") + cl.set_args!(k2, A.data[].mem, B.data[].mem, C.data[].mem, UInt32(count)) - h_twos = fill(2f0, count) - copyto!(A, h_twos) - copyto!(B, h_twos) + h_twos = fill(2f0, count) + copyto!(A, h_twos) + copyto!(B, h_twos) - #TODO: check for ocl version, fill is opencl v1.2 - #cl.enqueue_fill(A, 2f0) - #cl.enqueue_fill(B, 2f0) + #TODO: check for ocl version, fill is opencl v1.2 + #cl.enqueue_fill(A, 2f0) + #cl.enqueue_fill(B, 2f0) - cl.enqueue_kernel(k, count) + cl.enqueue_kernel(k, count) - @test all(x -> x == 4.0, Array(C)) - end + @test all(x -> x == 4.0, Array(C)) +end - @testset "clcall" begin - simple_kernel = " - __kernel void test(__global float *i) { - *i += 1; - };" +@testset "clcall" begin + simple_kernel = " + __kernel void test(__global 
float *i) { + *i += 1; + };" - h_buff = Float32[1,] - d_arr = CLArray(h_buff) + h_buff = Float32[1,] + d_arr = CLArray(h_buff) - p = cl.Program(source=simple_kernel) |> cl.build! - k = cl.Kernel(p, "test") + p = cl.Program(source=simple_kernel) |> cl.build! + k = cl.Kernel(p, "test") - # dimensions must be the same size - @test_throws ArgumentError clcall(k, Tuple{CLPtr{Float32}}, d_arr; - global_size=(1,), local_size=(1,1)) - @test_throws ArgumentError clcall(k, Tuple{CLPtr{Float32}}, d_arr; - global_size=(1,1), local_size=(1,)) + # dimensions must be the same size + @test_throws ArgumentError clcall(k, Tuple{CLPtr{Float32}}, d_arr; + global_size=(1,), local_size=(1,1)) + @test_throws ArgumentError clcall(k, Tuple{CLPtr{Float32}}, d_arr; + global_size=(1,1), local_size=(1,)) - # dimensions are bounded - max_work_dim = cl.device().max_work_item_dims - bad = tuple([1 for _ in 1:(max_work_dim + 1)]) + # dimensions are bounded + max_work_dim = cl.device().max_work_item_dims + bad = tuple([1 for _ in 1:(max_work_dim + 1)]) - # calls are asynchronous, but cl.read blocks - clcall(k, Tuple{CLPtr{Float32}}, d_arr) - @test Array(d_arr) == [2f0] + # calls are asynchronous, but cl.read blocks + clcall(k, Tuple{CLPtr{Float32}}, d_arr) + @test Array(d_arr) == [2f0] - # enqueue task is an alias for calling - # a kernel with a global/local size of 1 - evt = cl.enqueue_task(k) - @test Array(d_arr) == [3f0] - end + # enqueue task is an alias for calling + # a kernel with a global/local size of 1 + evt = cl.enqueue_task(k) + @test Array(d_arr) == [3f0] +end - @testset "packed structures" begin - test_source = " - struct __attribute__((packed)) Test2{ - long f1; - int __attribute__((aligned (8))) f2; - }; - __kernel void structest(__global float *out, struct Test2 b){ - out[0] = b.f1; - out[1] = b.f2; - } - " - prg = cl.Program(source = test_source) - cl.build!(prg) - structkernel = cl.Kernel(prg, "structest") - out = CLArray{Float32}(undef, 2) - bstruct = (1, Int32(4)) - 
clcall(structkernel, Tuple{CLPtr{Float32}, Tuple{Int64, Cint}}, out, bstruct) - @test Array(out) == [1f0, 4f0] - end +@testset "packed structures" begin + test_source = " + struct __attribute__((packed)) Test2{ + long f1; + int __attribute__((aligned (8))) f2; + }; + __kernel void structest(__global float *out, struct Test2 b){ + out[0] = b.f1; + out[1] = b.f2; + } + " + prg = cl.Program(source = test_source) + cl.build!(prg) + structkernel = cl.Kernel(prg, "structest") + out = CLArray{Float32}(undef, 2) + bstruct = (1, Int32(4)) + clcall(structkernel, Tuple{CLPtr{Float32}, Tuple{Int64, Cint}}, out, bstruct) + @test Array(out) == [1f0, 4f0] +end - @testset "vector arguments" begin - test_source = " - __kernel void vec3_unpack(__global float *out, float3 a, float3 b) { - out[0] = a.x; - out[1] = a.y; - out[2] = a.z; - out[3] = b.x; - out[4] = b.y; - out[5] = b.z; - } - " - prg = cl.Program(source = test_source) - cl.build!(prg) - vec3kernel = cl.Kernel(prg, "vec3_unpack") - out = CLArray{Float32}(undef, 6) - # NOTE: the user is responsible for padding the vector to 4 elements - # (only on some platforms) - vec3_a = (1f0, 2f0, 3f0, 0f0) - vec3_b = (4f0, 5f0, 6f0, 0f0) - clcall( - vec3kernel, Tuple{CLPtr{Float32}, NTuple{4, Float32}, NTuple{4, Float32}}, - out, vec3_a, vec3_b) - @test Array(out) == [1f0, 2f0, 3f0, 4f0, 5f0, 6f0] - end +@testset "vector arguments" begin + test_source = " + __kernel void vec3_unpack(__global float *out, float3 a, float3 b) { + out[0] = a.x; + out[1] = a.y; + out[2] = a.z; + out[3] = b.x; + out[4] = b.y; + out[5] = b.z; + } + " + prg = cl.Program(source = test_source) + cl.build!(prg) + vec3kernel = cl.Kernel(prg, "vec3_unpack") + out = CLArray{Float32}(undef, 6) + # NOTE: the user is responsible for padding the vector to 4 elements + # (only on some platforms) + vec3_a = (1f0, 2f0, 3f0, 0f0) + vec3_b = (4f0, 5f0, 6f0, 0f0) + clcall( + vec3kernel, Tuple{CLPtr{Float32}, NTuple{4, Float32}, NTuple{4, Float32}}, + out, vec3_a, vec3_b) + 
@test Array(out) == [1f0, 2f0, 3f0, 4f0, 5f0, 6f0] end diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl index debf5dec..97e54154 100644 --- a/test/kernelabstractions.jl +++ b/test/kernelabstractions.jl @@ -1,3 +1,17 @@ +# KernelAbstractions has a testsuite that isn't part of the main package. +# Include it directly. + +const KATestSuite = let + mod = @eval module $(gensym()) + using ..Test + import KernelAbstractions + kernelabstractions = pathof(KernelAbstractions) + kernelabstractions_root = dirname(dirname(kernelabstractions)) + include(joinpath(kernelabstractions_root, "test", "testsuite.jl")) + end + mod.Testsuite +end + skip_tests=Set([ "sparse", "Convert", # Need to opt out of i128 diff --git a/test/memory.jl b/test/memory.jl index 6adc081c..f352fd73 100644 --- a/test/memory.jl +++ b/test/memory.jl @@ -1,32 +1,30 @@ -@testset "Memory" begin - function create_test_buffer() - testarray = zeros(Float32, 1000) - cl.Buffer(testarray) - end +function create_test_buffer() + testarray = zeros(Float32, 1000) + cl.Buffer(testarray) +end - @testset "context" begin - buf = create_test_buffer() +@testset "context" begin + buf = create_test_buffer() - ctx = cl.context(buf) + ctx = cl.context(buf) - @test ctx != nothing - @test isequal(ctx, cl.context()) != nothing - end + @test ctx != nothing + @test isequal(ctx, cl.context()) != nothing +end - @testset "properties" begin - buf = create_test_buffer() +@testset "properties" begin + buf = create_test_buffer() - expectations = [ - (:type, cl.CL_MEM_OBJECT_BUFFER), - (:flags, (:rw, :copy)), - (:size, sizeof(buf)), - (:reference_count, 1), - (:map_count, 0) - ] + expectations = [ + (:type, cl.CL_MEM_OBJECT_BUFFER), + (:flags, (:rw, :copy)), + (:size, sizeof(buf)), + (:reference_count, 1), + (:map_count, 0) + ] - for expectation in expectations - prop, value = expectation - @test getproperty(buf, prop) == value - end + for expectation in expectations + prop, value = expectation + @test getproperty(buf, prop) 
== value end end diff --git a/test/platform.jl b/test/platform.jl index 5e1c95f4..b6f410d6 100644 --- a/test/platform.jl +++ b/test/platform.jl @@ -1,27 +1,25 @@ -@testset "Platform" begin - @testset "Info" begin - @test length(cl.platforms()) == cl.num_platforms() +@testset "Info" begin + @test length(cl.platforms()) == cl.num_platforms() - @test cl.platform() != nothing - @test pointer(cl.platform()) != C_NULL - @test cl.platform().opencl_version isa VersionNumber - end + @test cl.platform() != nothing + @test pointer(cl.platform()) != C_NULL + @test cl.platform().opencl_version isa VersionNumber +end - @testset "Equality" begin - platform = cl.platforms()[1] - platform_copy = cl.platforms()[1] +@testset "Equality" begin + platform = cl.platforms()[1] + platform_copy = cl.platforms()[1] - @test pointer(platform) == pointer(platform_copy) - @test hash(platform) == hash(platform_copy) - @test isequal(platform, platform) + @test pointer(platform) == pointer(platform_copy) + @test hash(platform) == hash(platform_copy) + @test isequal(platform, platform) - if length(cl.platforms()) > 1 - p1 = cl.platforms()[1] - for p2 in cl.platforms()[2:end] - @test pointer(p2) != pointer(p1) - @test hash(p2) != hash(p1) - @test !isequal(p2, p1) - end + if length(cl.platforms()) > 1 + p1 = cl.platforms()[1] + for p2 in cl.platforms()[2:end] + @test pointer(p2) != pointer(p1) + @test hash(p2) != hash(p1) + @test !isequal(p2, p1) end end end diff --git a/test/program.jl b/test/program.jl index 01c62839..b7d433b4 100644 --- a/test/program.jl +++ b/test/program.jl @@ -1,70 +1,68 @@ -@testset "Program" begin - let - @test_throws ArgumentError cl.Program() - @test_throws ArgumentError cl.Program(source="", il="") - end +let + @test_throws ArgumentError cl.Program() + @test_throws ArgumentError cl.Program(source="", il="") +end - test_source = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c) - { - uint gid = get_global_id(0); - c[gid] = a[gid] 
+ b[gid]; - } - " +test_source = " +__kernel void sum(__global const float *a, + __global const float *b, + __global float *c) +{ + uint gid = get_global_id(0); + c[gid] = a[gid] + b[gid]; +} +" - function create_test_program() - cl.Program(source=test_source) - end +function create_test_program() + cl.Program(source=test_source) +end - @testset "source constructor" begin - prg = cl.Program(source=test_source) - @test prg != nothing - end - @testset "info" begin - prg = cl.Program(source=test_source) +@testset "source constructor" begin + prg = cl.Program(source=test_source) + @test prg != nothing +end +@testset "info" begin + prg = cl.Program(source=test_source) - @test prg.context == cl.context() + @test prg.context == cl.context() - @test typeof(prg.devices) == Vector{cl.Device} - @test length(prg.devices) > 0 - @test cl.device() in prg.devices + @test typeof(prg.devices) == Vector{cl.Device} + @test length(prg.devices) > 0 + @test cl.device() in prg.devices - @test typeof(prg.source) == String - @test prg.source == test_source + @test typeof(prg.source) == String + @test prg.source == test_source - @test prg.reference_count > 0 - @test isempty(strip(prg.build_log[cl.device()])) - end + @test prg.reference_count > 0 + @test isempty(strip(prg.build_log[cl.device()])) +end - @testset "build" begin - prg = cl.Program(source=test_source) - @test cl.build!(prg) != nothing +@testset "build" begin + prg = cl.Program(source=test_source) + @test cl.build!(prg) != nothing - @test prg.build_status[cl.device()] == cl.CL_BUILD_SUCCESS - @test prg.build_log[cl.device()] isa String - end + @test prg.build_status[cl.device()] == cl.CL_BUILD_SUCCESS + @test prg.build_log[cl.device()] isa String +end - @testset "source code" begin - prg = cl.Program(source=test_source) - @test prg.source == test_source - end +@testset "source code" begin + prg = cl.Program(source=test_source) + @test prg.source == test_source +end - if contains(cl.platform().vendor, "pocl") - @warn "Skipping 
binary program tests on $(cl.platform().name)" - else - @testset "binaries" begin - prg = cl.Program(source=test_source) |> cl.build! +if contains(cl.platform().vendor, "pocl") + @warn "Skipping binary program tests on $(cl.platform().name)" +else + @testset "binaries" begin + prg = cl.Program(source=test_source) |> cl.build! - @test cl.device() in collect(keys(prg.binaries)) - binaries = prg.binaries - @test cl.device() in collect(keys(binaries)) - @test binaries[cl.device()] != nothing - @test length(binaries[cl.device()]) > 0 - prg2 = cl.Program(binaries=binaries) - @test prg2.binaries == binaries - @test prg2.source === nothing - end + @test cl.device() in collect(keys(prg.binaries)) + binaries = prg.binaries + @test cl.device() in collect(keys(binaries)) + @test binaries[cl.device()] != nothing + @test length(binaries[cl.device()]) > 0 + prg2 = cl.Program(binaries=binaries) + @test prg2.binaries == binaries + @test prg2.source === nothing end end diff --git a/test/runtests.jl b/test/runtests.jl index 6fd437f2..2e04653d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,413 +1,152 @@ -using Distributed -using Dates -import REPL -using Printf: @sprintf -using Base.Filesystem: path_separator +using ParallelTestRunner using Preferences +import OpenCL, pocl_jll +import Test -# parse some command-line arguments -function extract_flag!(args, flag, default=nothing) - for f in args - if startswith(f, flag) - # Check if it's just `--flag` or if it's `--flag=foo` - if f != flag - val = split(f, '=')[2] - if default !== nothing && !(typeof(default) <: AbstractString) - val = parse(typeof(default), val) - end - else - val = default - end - - # Drop this value from our args - filter!(x -> x != f, args) - return (true, val) - end - end - return (false, default) -end -do_help, _ = extract_flag!(ARGS, "--help") -if do_help - println(""" - Usage: runtests.jl [--help] [--list] [--jobs=N] [TESTS...] - - --help Show this text. - --list List all available tests. 
- --verbose Print more information during testing. - --quickfail Fail the entire run as soon as a single test errored. - --jobs=N Launch `N` processes to perform tests (default: Sys.CPU_THREADS). - --platform=NAME Run tests on the platform named `NAME` (default: all platforms). - - Remaining arguments filter the tests that will be executed.""") - exit(0) -end -_, jobs = extract_flag!(ARGS, "--jobs", Sys.CPU_THREADS) -do_verbose, _ = extract_flag!(ARGS, "--verbose") -do_quickfail, _ = extract_flag!(ARGS, "--quickfail") - -include("setup.jl") # make sure everything is precompiled @info "System information:\n" * sprint(io->OpenCL.versioninfo(io)) -@info "Running $jobs tests in parallel. If this is too many, specify the `--jobs` argument to the tests, or set the JULIA_CPU_THREADS environment variable." - -# choose tests -const tests = [] -const test_runners = Dict() -## files in the test folder -for (rootpath, dirs, files) in walkdir(@__DIR__) - # find Julia files - filter!(files) do file - endswith(file, ".jl") && file !== "setup.jl" && file !== "runtests.jl" - end - isempty(files) && continue - - # strip extension - files = map(files) do file - file[1:end-3] - end - - # prepend subdir - subdir = relpath(rootpath, @__DIR__) - if subdir != "." 
- files = map(files) do file - joinpath(subdir, file) +## custom arguments +args = parse_args(ARGS; custom=["platform"]) + +# determine tests to run +const testsuite = find_tests(pwd()) +## GPUArrays test suite: not part of the main package +const GPUArraysTestSuite = let + mod = @eval module $(gensym()) + using ..Test + import GPUArrays + gpuarrays = pathof(GPUArrays) + gpuarrays_root = dirname(dirname(gpuarrays)) + include(joinpath(gpuarrays_root, "test", "testsuite.jl")) end - end - - # unify path separators - files = map(files) do file - replace(file, path_separator => '/') - end - - append!(tests, files) - for file in files - test_runners[file] = ()->include("$(@__DIR__)/$file.jl") - end + mod.TestSuite end -sort!(tests; by=(file)->stat("$(@__DIR__)/$file.jl").size, rev=true) -## GPUArrays testsuite for name in keys(GPUArraysTestSuite.tests) - push!(tests, "gpuarrays/$name") - test_runners["gpuarrays/$name"] = ()->GPUArraysTestSuite.tests[name](CLArray) -end -## finalize -unique!(tests) - -# parse some more command-line arguments -## --list to list all available tests -do_list, _ = extract_flag!(ARGS, "--list") -if do_list - println("Available tests:") - for test in sort(tests) - println(" - $test") - end - exit(0) + test = "gpuarrays/$name" + testsuite[test] = :(GPUArraysTestSuite.tests[$name](CLArray)) end -## --platform selector -do_platform, platform = extract_flag!(ARGS, "--platform", nothing) -## no options should remain -optlike_args = filter(startswith("-"), ARGS) -if !isempty(optlike_args) - error("Unknown test options `$(join(optlike_args, " "))` (try `--help` for usage instructions)") -end -## the remaining args filter tests -if isempty(ARGS) - # default to running all tests, except: - filter!(tests) do test - if load_preference(OpenCL, "default_memory_backend") == "svm" && - test == "gpuarrays/indexing scalar" +## filter +if filter_tests!(testsuite, args) + if load_preference(OpenCL, "default_memory_backend") == "svm" # GPUArrays' scalar indexing 
tests assume that indexing is not supported
+            delete!(testsuite, "gpuarrays/indexing scalar")
            return false
        end
-
-        return true
-    end
-else
-    filter!(tests) do test
-        any(arg->startswith(test, arg), ARGS)
-    end
end
-# add workers
-const test_exeflags = Base.julia_cmd()
-filter!(test_exeflags.exec) do c
-    return !(startswith(c, "--depwarn") || startswith(c, "--check-bounds"))
-end
-push!(test_exeflags.exec, "--check-bounds=yes")
-push!(test_exeflags.exec, "--startup-file=no")
-push!(test_exeflags.exec, "--depwarn=yes")
-push!(test_exeflags.exec, "--project=$(Base.active_project())")
-const test_exename = popfirst!(test_exeflags.exec)
-function addworker(X; kwargs...)
-    withenv("JULIA_NUM_THREADS" => 1, "OPENBLAS_NUM_THREADS" => 1) do
-        procs = addprocs(X; exename=test_exename, exeflags=test_exeflags, kwargs...)
-        @everywhere procs include($(joinpath(@__DIR__, "setup.jl")))
-        procs
+# wrap tests in device loops
+function generate_test(test, expr)
+    # some tests require native execution capabilities
+    requires_il = test in ["atomics", "execution", "intrinsics", "kernelabstractions",
+                           "statistics", "linalg", ] ||
+                  startswith(test, "gpuarrays/")
+
+    # targets is a global variable that is defined in init_code
+    return quote
+        if isempty(targets)
+            platform_filter = $(args.custom["platform"])
+            for platform in cl.platforms(),
+                device in cl.devices(platform)
+                if platform_filter !== nothing
+                    # filter on the name or vendor
+                    names = lowercase.([platform.name, platform.vendor])
+                    if !any(contains(platform_filter.value), names)
+                        continue
+                    end
+                end
+                push!(targets, (; platform, device))
+            end
+            if isempty(targets)
+                if platform_filter !== nothing
+                    throw(ArgumentError("No OpenCL platforms found matching $(platform_filter.value)"))
+                else
+                    throw(ArgumentError("No OpenCL platforms found"))
+                end
+            end
+        end
+
+        @testset "$(device.name)" for (; platform, device) in targets
+            cl.platform!(platform)
+            cl.device!(device)
+
+            if !$(requires_il) || "cl_khr_il_program" in 
device.extensions + $(expr) + end + end end end -addworker(min(jobs, length(tests))) - -# pretty print information about gc and mem usage -testgroupheader = "Test" -workerheader = "(Worker)" -name_align = maximum([textwidth(testgroupheader) + textwidth(" ") + - textwidth(workerheader); map(x -> textwidth(x) + - 3 + ndigits(nworkers()), tests)]) -elapsed_align = textwidth("Time (s)") -gc_align = textwidth("GC (s)") -percent_align = textwidth("GC %") -alloc_align = textwidth("Alloc (MB)") -rss_align = textwidth("RSS (MB)") -printstyled(" "^(name_align + textwidth(testgroupheader) - 3), " | ") -printstyled(" | ---------------- CPU ---------------- |\n", color=:white) -printstyled(testgroupheader, color=:white) -printstyled(lpad(workerheader, name_align - textwidth(testgroupheader) + 1), " | ", color=:white) -printstyled("Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) |\n", color=:white) -print_lock = stdout isa Base.LibuvStream ? stdout.lock : ReentrantLock() -if stderr isa Base.LibuvStream - stderr.lock = print_lock +for test in keys(testsuite) + testsuite[test] = generate_test(test, testsuite[test]) end -function print_testworker_stats(test, wrkr, resp) - @nospecialize resp - lock(print_lock) - try - printstyled(test, color=:white) - printstyled(lpad("($wrkr)", name_align - textwidth(test) + 1, " "), " | ", color=:white) - time_str = @sprintf("%7.2f",resp[2]) - printstyled(lpad(time_str, elapsed_align, " "), " | ", color=:white) - cpu_gc_str = @sprintf("%5.2f", resp[4]) - printstyled(lpad(cpu_gc_str, gc_align, " "), " | ", color=:white) - # since there may be quite a few digits in the percentage, - # the left-padding here is less to make sure everything fits - cpu_percent_str = @sprintf("%4.1f", 100 * resp[4] / resp[2]) - printstyled(lpad(cpu_percent_str, percent_align, " "), " | ", color=:white) - cpu_alloc_str = @sprintf("%5.2f", resp[3] / 2^20) - printstyled(lpad(cpu_alloc_str, alloc_align, " "), " | ", color=:white) +const init_code = quote + using OpenCL, 
pocl_jll - cpu_rss_str = @sprintf("%5.2f", resp[6] / 2^20) - printstyled(lpad(cpu_rss_str, rss_align, " "), " |\n", color=:white) - finally - unlock(print_lock) - end -end -global print_testworker_started = (name, wrkr)->begin - if do_verbose - lock(print_lock) - try - printstyled(name, color=:white) - printstyled(lpad("($wrkr)", name_align - textwidth(name) + 1, " "), " |", - " "^elapsed_align, "started at $(now())\n", color=:white) - finally - unlock(print_lock) + OpenCL.allowscalar(false) + const targets = [] + + # GPUArrays has a testsuite that isn't part of the main package. + # Include it directly. + const GPUArraysTestSuite = let + mod = @eval module $(gensym()) + using ..Test + import GPUArrays + gpuarrays = pathof(GPUArrays) + gpuarrays_root = dirname(dirname(gpuarrays)) + include(joinpath(gpuarrays_root, "test", "testsuite.jl")) end + mod.TestSuite end -end -function print_testworker_errored(name, wrkr) - lock(print_lock) - try - printstyled(name, color=:red) - printstyled(lpad("($wrkr)", name_align - textwidth(name) + 1, " "), " |", - " "^elapsed_align, " failed at $(now())\n", color=:red) - finally - unlock(print_lock) - end -end -# run tasks -t0 = now() -results = [] -all_tasks = Task[] -all_tests = copy(tests) -try - # Monitor stdin and kill this task on ^C - # but don't do this on Windows, because it may deadlock in the kernel - t = current_task() - running_tests = Dict{String, DateTime}() - if !Sys.iswindows() && isa(stdin, Base.TTY) - stdin_monitor = @async begin - term = REPL.Terminals.TTYTerminal("xterm", stdin, stdout, stderr) - try - REPL.Terminals.raw!(term, true) - while true - c = read(term, Char) - if c == '\x3' - Base.throwto(t, InterruptException()) - break - elseif c == '?' 
- println("Currently running: ") - tests = sort(collect(running_tests), by=x->x[2]) - foreach(tests) do (test, date) - println(test, " (running for ", round(now()-date, Minute), ")") - end - end - end - catch e - isa(e, InterruptException) || rethrow() - finally - REPL.Terminals.raw!(term, false) + const device_eltypes = Dict() + function GPUArraysTestSuite.supported_eltypes(::Type{<:CLArray}) + get!(device_eltypes, cl.device()) do + types = [Int16, Int32, Int64, + Complex{Int16}, Complex{Int32}, Complex{Int64}, + Float32, ComplexF32] + if "cl_khr_fp64" in cl.device().extensions + push!(types, Float64) + push!(types, ComplexF64) end + if "cl_khr_fp16" in cl.device().extensions + push!(types, Float16) + push!(types, ComplexF16) + end + return types end end - @sync begin - function recycle_worker(p) - rmprocs(p, waitfor=30) - return nothing - end - - for p in workers() - @async begin - push!(all_tasks, current_task()) - while length(tests) > 0 - test = popfirst!(tests) - - # sometimes a worker failed, and we need to spawn a new one - if p === nothing - p = addworker(1)[1] - end - wrkr = p - - local resp - # run the test - running_tests[test] = now() - try - resp = remotecall_fetch(runtests, wrkr, - test_runners[test], test, - platform) - catch e - isa(e, InterruptException) && return - resp = Any[e] - end - delete!(running_tests, test) - push!(results, (test, resp)) + testf(f, xs...; kwargs...) = GPUArraysTestSuite.compare(f, CLArray, xs...; kwargs...) - # act on the results - if resp[1] isa Exception - print_testworker_errored(test, wrkr) - do_quickfail && Base.throwto(t, InterruptException()) + ## auxiliary stuff - # the worker encountered some failure, recycle it - # so future tests get a fresh environment - p = recycle_worker(p) - else - print_testworker_stats(test, wrkr, resp) + # Run some code on-device + macro on_device(ex...) 
+ code = ex[end] + kwargs = ex[1:end-1] - compilations = resp[7] - if Sys.iswindows() && compilations > 100 - # XXX: restart to avoid handle exhaustion - # (see pocl/pocl#1941) - @warn "Restarting worker $wrkr to avoid handle exhaustion" - p = recycle_worker(p) - end - end + @gensym kernel + esc(quote + let + function $kernel() + $code + return end - if p !== nothing - recycle_worker(p) - end - end - end - end -catch e - isa(e, InterruptException) || rethrow() - # If the test suite was merely interrupted, still print the - # summary, which can be useful to diagnose what's going on - foreach(task -> begin - istaskstarted(task) || return - istaskdone(task) && return - try - schedule(task, InterruptException(); error=true) - catch ex - @error "InterruptException" exception=ex,catch_backtrace() + @opencl $(kwargs...) $kernel() + cl.finish(cl.queue()) end - end, all_tasks) - for t in all_tasks - # NOTE: we can't just wait, but need to discard the exception, - # because the throwto for --quickfail also kills the worker. 
- try - wait(t) - catch e - showerror(stderr, e) - end - end -finally - if @isdefined stdin_monitor - schedule(stdin_monitor, InterruptException(); error=true) + end) end end -t1 = now() -elapsed = canonicalize(Dates.CompoundPeriod(t1-t0)) -println("Testing finished in $elapsed") -# construct a testset to render the test results -o_ts = Test.DefaultTestSet("Overall") -Test.push_testset(o_ts) -completed_tests = Set{String}() -for (testname, (resp,)) in results - push!(completed_tests, testname) - if isa(resp, Test.DefaultTestSet) - Test.push_testset(resp) - Test.record(o_ts, resp) - Test.pop_testset() - elseif isa(resp, Tuple{Int,Int}) - fake = Test.DefaultTestSet(testname) - for i in 1:resp[1] - Test.record(fake, Test.Pass(:test, nothing, nothing, nothing, nothing)) - end - for i in 1:resp[2] - Test.record(fake, Test.Broken(:test, nothing)) - end - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() - elseif isa(resp, RemoteException) && isa(resp.captured.ex, Test.TestSetException) - println("Worker $(resp.pid) failed running test $(testname):") - Base.showerror(stdout, resp.captured) - println() - fake = Test.DefaultTestSet(testname) - for i in 1:resp.captured.ex.pass - Test.record(fake, Test.Pass(:test, nothing, nothing, nothing, nothing)) - end - for i in 1:resp.captured.ex.broken - Test.record(fake, Test.Broken(:test, nothing)) - end - for t in resp.captured.ex.errors_and_fails - Test.record(fake, t) - end - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() +# avoid handle exhaustion on Windows by running each test in a separate process (pocl/pocl#1941) +function test_worker(test) + if Sys.iswindows() + addworker() else - if !isa(resp, Exception) - resp = ErrorException(string("Unknown result type : ", typeof(resp))) - end - # If this test raised an exception that is not a remote testset exception, - # i.e. 
not a RemoteException capturing a TestSetException that means - # the test runner itself had some problem, so we may have hit a segfault, - # deserialization errors or something similar. Record this testset as Errored. - fake = Test.DefaultTestSet(testname) - Test.record(fake, Test.Error(:nontest_error, testname, nothing, Any[(resp, [])], LineNumberNode(1))) - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() + nothing end end -for test in all_tests - (test in completed_tests) && continue - fake = Test.DefaultTestSet(test) - Test.record(fake, Test.Error(:test_interrupted, test, nothing, - [("skipped", [])], LineNumberNode(1))) - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() -end -println() -Test.print_test_results(o_ts, 1) -if !o_ts.anynonpass - println(" \033[32;1mSUCCESS\033[0m") -else - println(" \033[31;1mFAILURE\033[0m\n") - Test.print_test_errors(o_ts) - throw(Test.FallbackTestSetException("Test run finished with errors")) -end + +runtests(OpenCL, args; testsuite, init_code, test_worker) diff --git a/test/setup.jl b/test/setup.jl deleted file mode 100644 index 066424c6..00000000 --- a/test/setup.jl +++ /dev/null @@ -1,166 +0,0 @@ -using Distributed, Test -using OpenCL, pocl_jll -using IOCapture - -# KernelAbstractions has a testsuite that isn't part of the main package. -# Include it directly. -const KATestSuite = let - mod = @eval module $(gensym()) - using ..Test - import KernelAbstractions - kernelabstractions = pathof(KernelAbstractions) - kernelabstractions_root = dirname(dirname(kernelabstractions)) - include(joinpath(kernelabstractions_root, "test", "testsuite.jl")) - end - mod.Testsuite -end - -# GPUArrays has a testsuite that isn't part of the main package. -# Include it directly. 
-const GPUArraysTestSuite = let - mod = @eval module $(gensym()) - using ..Test - import GPUArrays - gpuarrays = pathof(GPUArrays) - gpuarrays_root = dirname(dirname(gpuarrays)) - include(joinpath(gpuarrays_root, "test", "testsuite.jl")) - end - mod.TestSuite -end -testf(f, xs...; kwargs...) = GPUArraysTestSuite.compare(f, CLArray, xs...; kwargs...) - -const device_eltypes = Dict() -function GPUArraysTestSuite.supported_eltypes(::Type{<:CLArray}) - get!(device_eltypes, cl.device()) do - types = [Int16, Int32, Int64, - Complex{Int16}, Complex{Int32}, Complex{Int64}, - Float32, ComplexF32] - if "cl_khr_fp64" in cl.device().extensions - push!(types, Float64) - push!(types, ComplexF64) - end - if "cl_khr_fp16" in cl.device().extensions - push!(types, Float16) - push!(types, ComplexF16) - end - return types - end -end - -using Random - - -## entry point - -const targets = [] - -function runtests(f, name, platform_filter) - old_print_setting = Test.TESTSET_PRINT_ENABLE[] - Test.TESTSET_PRINT_ENABLE[] = false - - if isempty(targets) - for platform in cl.platforms(), - device in cl.devices(platform) - if platform_filter !== nothing - # filter on the name or vendor - names = lowercase.([platform.name, platform.vendor]) - if !any(contains(platform_filter), names) - continue - end - end - push!(targets, (; platform, device)) - end - if isempty(targets) - if platform_filter === nothing - throw(ArgumentError("No OpenCL platforms found")) - else - throw(ArgumentError("No OpenCL platforms found matching $platform_filter")) - end - end - end - - try - # generate a temporary module to execute the tests in - mod_name = Symbol("Test", rand(1:100), "Main_", replace(name, '/' => '_')) - mod = @eval(Main, module $mod_name end) - @eval(mod, using Test, Random, OpenCL) - - let id = myid() - wait(@spawnat 1 print_testworker_started(name, id)) - end - - # some tests require native execution capabilities - requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] || 
- startswith(name, "gpuarrays/") - - ex = quote - GC.gc(true) - Random.seed!(1) - OpenCL.allowscalar(false) - - @timed @testset $"$name" begin - @testset "\$(device.name)" for (; platform, device) in $targets - cl.platform!(platform) - cl.device!(device) - - if !$requires_il || "cl_khr_il_program" in device.extensions - $f() - end - end - end - end - data = Core.eval(mod, ex) - #data[1] is the testset - - # process results - cpu_rss = Sys.maxrss() - compilations = OpenCL.compilations[] - if VERSION >= v"1.11.0-DEV.1529" - tc = Test.get_test_counts(data[1]) - passes,fails,error,broken,c_passes,c_fails,c_errors,c_broken = - tc.passes, tc.fails, tc.errors, tc.broken, tc.cumulative_passes, - tc.cumulative_fails, tc.cumulative_errors, tc.cumulative_broken - else - passes,fails,errors,broken,c_passes,c_fails,c_errors,c_broken = - Test.get_test_counts(data[1]) - end - if data[1].anynonpass == false - data = ((passes+c_passes,broken+c_broken), - data[2], - data[3], - data[4], - data[5]) - end - res = vcat(collect(data), cpu_rss, compilations) - - GC.gc(true) - res - finally - Test.TESTSET_PRINT_ENABLE[] = old_print_setting - end -end - - -## auxiliary stuff - -# Run some code on-device -macro on_device(ex...) - code = ex[end] - kwargs = ex[1:end-1] - - @gensym kernel - esc(quote - let - function $kernel() - $code - return - end - - @opencl $(kwargs...) $kernel() - cl.finish(cl.queue()) - end - end) -end - - -nothing # File is loaded via a remotecall to "include". Ensure it returns "nothing".