2 changes: 1 addition & 1 deletion Project.toml
@@ -32,7 +32,7 @@ StaticArrays = "0.12, 1.0"
UUIDs = "<0.0.1, 1.6"
UnsafeAtomics = "0.2.1"
UnsafeAtomicsLLVM = "0.1, 0.2"
julia = "1.6"
julia = "1.10"

[extensions]
EnzymeExt = "EnzymeCore"
4 changes: 2 additions & 2 deletions benchmark/benchmarks.jl
@@ -8,11 +8,11 @@ using KernelAbstractions
using Random

if !haskey(ENV, "KA_BACKEND")
const BACKEND = CPU()
const BACKEND = OpenCLBackend()
else
backend = ENV["KA_BACKEND"]
if backend == "CPU"
const BACKEND = CPU()
const BACKEND = OpenCLBackend()
elseif backend == "CUDA"
using CUDA
const BACKEND = CUDABackend()
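For context, the benchmark backend is selected through the `KA_BACKEND` environment variable shown in this hunk. A minimal usage sketch (the variable name comes from the diff; the invocation itself is an assumption):

```julia
# Run the benchmark suite against the CUDA backend instead of the default.
ENV["KA_BACKEND"] = "CUDA"
include("benchmark/benchmarks.jl")
```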
4 changes: 2 additions & 2 deletions docs/src/quickstart.md
@@ -27,13 +27,13 @@ end
## Launching kernel on the host

You can construct a kernel for a specific backend by calling the kernel with
`mul2_kernel(CPU(), 16)`. The first argument is a backend of type `KA.Backend`,
`mul2_kernel(OpenCLBackend(), 16)`. The first argument is a backend of type `KA.Backend`,
the second argument being the workgroup size. This returns a generated kernel
object that is then executed with the input argument `A` and an additional
static `ndrange` argument.

```julia
dev = CPU()
dev = OpenCLBackend()
A = ones(1024, 1024)
ev = mul2_kernel(dev, 64)(A, ndrange=size(A))
synchronize(dev)
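The `mul2_kernel` referenced here is defined earlier in the quickstart, above the shown hunk. A minimal sketch of that definition, reconstructed from the surrounding documentation (treat the exact form as an assumption):

```julia
using KernelAbstractions

# Every work item doubles one element of A.
@kernel function mul2_kernel(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end
```

Note that the hunk swaps only the backend object; constructing `OpenCLBackend()` presumably requires loading the package that provides it, which this hunk does not show.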
2 changes: 1 addition & 1 deletion examples/histogram.jl
@@ -94,7 +94,7 @@ end
histogram!(rand_histogram, rand_input)
histogram!(linear_histogram, linear_input)
histogram!(two_histogram, all_two)
KernelAbstractions.synchronize(CPU())
KernelAbstractions.synchronize(backend)

@test isapprox(Array(rand_histogram), histogram_rand_baseline)
@test isapprox(Array(linear_histogram), histogram_linear_baseline)
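The `backend` object used in the new `synchronize(backend)` call is not visible in this hunk; presumably it is obtained from the input data earlier in the example, along the lines of this sketch (an assumption, not part of the diff):

```julia
# Resolve the backend from the array so the example runs on any device.
backend = get_backend(rand_input)
KernelAbstractions.synchronize(backend)
```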
6 changes: 3 additions & 3 deletions examples/numa_aware.jl
@@ -19,7 +19,7 @@ Estimate the memory bandwidth (GB/s) by performing a time measurement of a
SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s).
"""
function measure_membw(
backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
backend = OpenCLBackend(); verbose = true, N = 1024 * 500_000, dtype = Float32,
init = :parallel,
)
bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY
@@ -52,8 +52,8 @@ function measure_membw(
end

# Static should be much better (on a system with multiple NUMA domains)
measure_membw(CPU());
measure_membw(CPU(; static = true));
measure_membw(OpenCLBackend());
# measure_membw(OpenCLBackend(; static = true));

# The following has significantly worse performance (even on systems with a single memory domain)!
# measure_membw(CPU(); init=:serial);
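`measure_membw` times a SAXPY update (`z .= a .* x .+ y`); the kernel itself sits above the shown hunks. A minimal sketch of such a kernel, assuming the usual KernelAbstractions pattern (the definition in the file may differ):

```julia
# SAXPY: z = a * x + y, one element per work item.
@kernel function saxpy_kernel!(a, @Const(x), @Const(y), z)
    I = @index(Global)
    @inbounds z[I] = a * x[I] + y[I]
end
```

The commented-out `static = true` call suggests that static thread pinning was an option of the `CPU` backend this PR replaces; whether `OpenCLBackend` accepts such a keyword is left open by this diff.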
109 changes: 6 additions & 103 deletions src/KernelAbstractions.jl
@@ -35,6 +35,7 @@ and then invoked on the arguments.
- [`@uniform`](@ref)
- [`@synchronize`](@ref)
- [`@print`](@ref)
- [`@context`](@ref)

# Example:

@@ -51,45 +52,33 @@ synchronize(backend)
```
"""
macro kernel(expr)
__kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
__kernel(expr, #=force_inbounds=# false)
end

"""
@kernel config function f(args) end

This allows for two different configurations:

1. `cpu={true, false}`: Disables code generation of the CPU function when set to `false`. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
2. `inbounds={false, true}`: Forces an `@inbounds` macro around the function definition, for cases where the kernel would otherwise need many `@inbounds` annotations. Note that this can lead to incorrect results, crashes, etc. and is fundamentally unsafe. Be careful!

- [`@context`](@ref)
@kernel inbounds={false, true} function f(args) end

!!! warn
This is an experimental feature.
"""
macro kernel(ex...)
if length(ex) == 1
__kernel(ex[1], true, false)
__kernel(ex[1], false)
else
generate_cpu = true
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
generate_cpu = ex[i].args[2]
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
else
error(
"Configuration should be of form:\n" *
"* `cpu=true`\n" *
"* `inbounds=false`\n" *
"got `", ex[i], "`",
)
end
end
__kernel(ex[end], generate_cpu, force_inbounds)
__kernel(ex[end], force_inbounds)
end
end
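A short usage sketch for the remaining `inbounds` configuration (kernel name and body are illustrative only):

```julia
# Forces @inbounds over the whole kernel body; unsafe if indices can go out of range.
@kernel inbounds=true function scale_kernel!(A, s)
    I = @index(Global)
    A[I] = s * A[I]
end
```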

@@ -198,47 +187,6 @@ macro localmem(T, dims)
end
end

"""
@private T dims

Declare storage that is local to each item in the workgroup. This can be safely used
across [`@synchronize`](@ref) statements. On a CPU, this will allocate additional implicit
dimensions to ensure correct localization.

For storage that only persists between `@synchronize` statements, an `MArray` can be used
instead.

See also [`@uniform`](@ref).
"""
macro private(T, dims)
if dims isa Integer
dims = (dims,)
end
quote
$Scratchpad($(esc(:__ctx__)), $(esc(T)), Val($(esc(dims))))
end
end

"""
@private mem = 1

Creates a private local copy of `mem` for each item in the workgroup. This can be safely
used across [`@synchronize`](@ref) statements.
"""
macro private(expr)
esc(expr)
end

"""
@uniform expr

`expr` is evaluated outside the workitem scope. This is useful for variable declarations
that span workitems, or are reused across `@synchronize` statements.
"""
macro uniform(value)
esc(value)
end
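For readers following the `@private` and `@uniform` semantics described above, here is a small sketch of how they combine with local memory and `@synchronize` inside a kernel, assuming standard KernelAbstractions behaviour (the kernel name and computation are illustrative, and the `ndrange` is assumed to be a multiple of the workgroup size):

```julia
# Reverses each workgroup-sized chunk of A into B and adds back the original element.
@kernel function reverse_in_group_kernel!(B, @Const(A))
    gi = @index(Global, Linear)
    li = @index(Local, Linear)

    # Evaluated once per workgroup rather than once per work item.
    ws = @uniform prod(@groupsize())

    # Scratch shared by the whole workgroup.
    tile = @localmem eltype(A) (@groupsize()[1])

    # Per-work-item storage that survives the barrier below.
    mine = @private eltype(A) 1

    @inbounds mine[1] = A[gi]
    @inbounds tile[li] = mine[1]

    @synchronize()

    # Reading another item's slot is safe only after the barrier.
    @inbounds B[gi] = mine[1] + tile[ws - li + 1]
end
```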

"""
@synchronize()

@@ -258,10 +206,6 @@
After a `@synchronize` statement all reads and writes to global and local memory
from each thread in the workgroup are visible to all other threads in the
workgroup. `cond` is not allowed to have any visible side effects.

# Platform differences
- `GPU`: This synchronization will only occur if `cond` evaluates to `true`.
- `CPU`: This synchronization will always occur.
"""
macro synchronize(cond)
quote
@@ -274,16 +218,13 @@

Access the hidden context object used by KernelAbstractions.

!!! warn
Only valid to be used from a kernel with `cpu=false`.

```
function f(@context, a)
I = @index(Global, Linear)
a[I]
end

@kernel cpu=false function my_kernel(a)
@kernel function my_kernel(a)
f(@context, a)
end
```
@@ -296,10 +237,6 @@
@print(items...)

This is a unified print statement.

# Platform differences
- `GPU`: This will reorganize the items to print via `@cuprintf`
- `CPU`: This will call `print(items...)`
"""
macro print(items...)

@@ -420,37 +357,6 @@ Abstract type for all KernelAbstractions backends.
"""
abstract type Backend end

"""
Abstract type for all GPU based KernelAbstractions backends.

!!! note
New backend implementations **must** sub-type this abstract type.
"""
abstract type GPU <: Backend end

"""
CPU(; static=false)

Instantiate a CPU (multi-threaded) backend.

## Options:
- `static`: Uses a static thread assignment; this can be beneficial for NUMA-aware code.
Defaults to false.
"""
struct CPU <: Backend
static::Bool
CPU(; static::Bool = false) = new(static)
end

"""
isgpu(::Backend)::Bool

Returns true for all [`GPU`](@ref) backends.
"""
isgpu(::GPU) = true
isgpu(::CPU) = false


"""
get_backend(A::AbstractArray)::Backend

@@ -465,12 +371,9 @@ function get_backend end
# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
get_backend(A::AbstractArray) = get_backend(parent(A))

get_backend(::Array) = CPU()

Comment on lines -468 to -469 (Member Author): Should this error?

# Define:
# adapt_storage(::Backend, a::Array) = adapt(BackendArray, a)
# adapt_storage(::Backend, a::BackendArray) = a
Adapt.adapt_storage(::CPU, a::Array) = a
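The comment above states the contract a backend package is expected to satisfy for host arrays. A hedged illustration using a hypothetical `MyBackend` with a hypothetical device-array type `MyArray` (both names invented here; real backends define the analogous methods for their own types):

```julia
using Adapt
import KernelAbstractions as KA

# Hypothetical backend and device-array wrapper, purely for illustration.
struct MyBackend <: KA.Backend end

struct MyArray{T, N} <: AbstractArray{T, N}
    data::Array{T, N}
end
Base.size(a::MyArray) = size(a.data)
Base.getindex(a::MyArray, i::Int...) = a.data[i...]

# Arrays of this type resolve to MyBackend when a kernel is launched on them.
KA.get_backend(::MyArray) = MyBackend()

# Host arrays are converted at launch time; device arrays pass through unchanged.
Adapt.adapt_storage(::MyBackend, a::Array) = MyArray(a)
Adapt.adapt_storage(::MyBackend, a::MyArray) = a
```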

"""
allocate(::Backend, Type, dims...)::AbstractArray