SciML
diff --git a/.buildkite/runtests.yml b/.buildkite/runtests.yml
Lines changed: 29 additions & 1 deletion
diff --git a/Project.toml b/Project.toml
Lines changed: 1 addition & 1 deletion
diff --git a/examples/neural_network/nn.jl b/examples/neural_network/nn.jl
Lines changed: 12 additions & 16 deletions
diff --git a/examples/ode_estimation/Lotka_Volterra/lotka_volterra.jl b/examples/ode_estimation/Lotka_Volterra/lotka_volterra.jl
Lines changed: 2 additions & 2 deletions
diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
Lines changed: 25 additions & 21 deletions
diff --git a/src/ode_pso.jl b/src/ode_pso.jl
Lines changed: 34 additions & 43 deletions
@@ -26,6 +26,34 @@ steps:
     # Don't run Buildkite if the commit message includes the text [skip tests]
     if: build.message !~ /\[skip tests\]/
 
+  - label: ":julia: [AMDGPU] Run tests on Julia v{{matrix.version}}"
+    matrix:
+      setup:
+        version:
+          - "1"
+    env:
+      GROUP: AMDGPU
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "{{matrix.version}}"
+    commands:
+      - |
+        julia --project=test -e '
+          import Pkg
+          Pkg.add(; name = "AMDGPU")'
+        rm test/Manifest.toml
+        julia --project -e'
+          import Pkg
+          println("+++ :julia: Running tests")
+          Pkg.test(; coverage=false)'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    timeout_in_minutes: 120
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
+
 env:
   JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
-  # SECRET_CODECOV_TOKEN: "..."
+  # SECRET_CODECOV_TOKEN: "..."
@@ -4,8 +4,8 @@ authors = ["Utkarsh <[email protected]> and contributors"]
 version = "1.0.0-DEV"
 
 [deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 
@@ -2,7 +2,7 @@ using SimpleChains
 using IterTools
 using MLDatasets
 using Random
-dataset =  MLDatasets.Iris().dataframe
+dataset = MLDatasets.Iris().dataframe
 
 data = Array(dataset)
 data = data[shuffle(1:end), :]
@@ -17,11 +17,9 @@ function mapstrtoclass(flower)
     end
 end
 ytrain = map(mapstrtoclass, data[:, 5])
-lenet = SimpleChain(
-    static(4),
+lenet = SimpleChain(static(4),
     TurboDense{true}(tanh, 20),
-    TurboDense{true}(identity, 3),
-)
+    TurboDense{true}(identity, 3))
 lenetloss = SimpleChains.add_loss(lenet, LogitCrossEntropyLoss(ytrain))
 
 p = SimpleChains.init_params(lenet);
@@ -30,18 +28,16 @@ G = SimpleChains.alloc_threaded_grad(lenet);
 
 lenetloss(xtrain, p)
 
-report = let mlpdloss = lenetloss, X=xtrain
-p -> begin
-      let train = mlpdloss(X, p)
-        @info "Loss:" train
-      end
+report = let mlpdloss = lenetloss, X = xtrain
+    p -> begin
+        let train = mlpdloss(X, p)
+            @info "Loss:" train
+        end
     end
 end
 
 for _ in 1:3
-    @time SimpleChains.train_unbatched!(
-      G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000
-    );
+    @time SimpleChains.train_unbatched!(G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000)
     report(p)
 end
 
@@ -53,10 +49,10 @@ using Optimization, PSOGPU
 
 lb = -ones(length(p)) .* 10
 ub = ones(length(p)) .* 10
-prob = OptimizationProblem((u,data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
+prob = OptimizationProblem((u, data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
 
 n_particles = 1000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
-    maxiters = 1000)
+    ParallelPSOKernel(n_particles; threaded = true),
+    maxiters = 1000)
@@ -1,4 +1,4 @@
-using StaticArrays, SciMLBase
+using StaticArrays, SciMLBase, OrdinaryDiffEq
 
 function f(u, p, t)
     dx = p[1] * u[1] - p[2] * u[1] * u[2]
@@ -46,4 +46,4 @@ ub = SVector{length(optprob.u0), eltype(optprob.u0)}(fill(eltype(optprob.u0)(Inf
     gbest,
     gpu_data,
     lb,
-    ub; saveat = t, dt = 0.1)
+    ub; saveat = t, dt = 0.1, backend = CUDABackend())
@@ -1,6 +1,6 @@
 module PSOGPU
 
-using SciMLBase, StaticArrays, Setfield, CUDA
+using SciMLBase, StaticArrays, Setfield, KernelAbstractions
 
 import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
 
@@ -19,29 +19,26 @@ struct PSOGBest{T1, T2 <: eltype(T1)}
     cost::T2
 end
 
-struct ParallelPSOKernel
+struct ParallelPSOKernel{Backend}
     num_particles::Int
     async::Bool
-    gpu::Bool
     threaded::Bool
+    backend::Backend
 end
-struct ParallelSyncPSO
+struct ParallelSyncPSO{Backend}
     num_particles::Int
+    backend::Backend
 end
 
 function ParallelPSOKernel(num_particles::Int;
-        async = false,
-        gpu = false, threaded = false)
-    ParallelPSOKernel(num_particles, async, gpu, threaded)
+        async = false, threaded = false, backend = CPU())
+    ParallelPSOKernel(num_particles, async, threaded, backend)
 end
 
 SciMLBase.allowsbounds(::ParallelPSOKernel) = true
 SciMLBase.allowsbounds(::ParallelSyncPSO) = true
 # SciMLBase.requiresbounds(::ParallelPSOKernel) = true
 
-struct GPU end
-struct CPU end
-
 include("./pso_cpu.jl")
 include("./pso_gpu.jl")
 include("./pso_async_gpu.jl")
@@ -58,24 +55,29 @@ function SciMLBase.__solve(prob::OptimizationProblem,
 
     prob = remake(prob; lb = lb, ub = ub)
 
-    if !(opt.gpu)
+    ## TODO: Benchmark the KA kernels on the CPU backend against the dedicated CPU implementations
+    if opt.backend isa CPU
         if opt.threaded
             gbest = PSO(prob; population = opt.num_particles, kwargs...)
         else
             init_gbest, particles = init_particles(prob, opt.num_particles)
             gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
         end
     else
+        backend = opt.backend
+        init_gbest, particles = init_particles(prob, opt.num_particles)
+        # TODO: Replace the manual allocate + copyto! below with a backend-agnostic equivalent of cu()/roc()
+        particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
+        gpu_particles = KernelAbstractions.allocate(backend,
+            particles_eltype,
+            size(particles))
+        copyto!(gpu_particles, particles)
+        gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
+        copyto!(gpu_init_gbest, [init_gbest])
         if opt.async
-            init_gbest, particles = init_particles(prob, opt.num_particles)
-            gpu_particles = cu(particles)
-            init_gbest = cu([init_gbest])
-            gbest = pso_solve_async_gpu!(prob, init_gbest, gpu_particles; kwargs...)
+            gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
         else
-            init_gbest, particles = init_particles(prob, opt.num_particles)
-            gpu_particles = cu(particles)
-            init_gbest = cu([init_gbest])
-            gbest = pso_solve_gpu!(prob, init_gbest, gpu_particles; kwargs...)
+            gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
         end
     end
 
@@ -91,9 +93,11 @@ function SciMLBase.__solve(prob::OptimizationProblem,
     ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
 
     prob = remake(prob; lb = lb, ub = ub)
-
+    backend = opt.backend
     init_gbest, particles = init_particles(prob, opt.num_particles)
-    gpu_particles = cu(particles)
+    particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
+    gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
+    copyto!(gpu_particles, particles)
     init_gbest = init_gbest
     gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)
 
 
@@ -1,47 +1,44 @@
-function _update_particle_states!(gpu_particles, lb, ub, gbest, w; c1 = 1.4962f0,
+@kernel function _update_particle_states!(gpu_particles, lb, ub, gbest, w; c1 = 1.4962f0,
         c2 = 1.4962f0)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    i > length(gpu_particles) && return
+    i = @index(Global, Linear)
+    if i <= length(gpu_particles)
+        @inbounds particle = gpu_particles[i]
 
-    @inbounds particle = gpu_particles[i]
+        updated_velocity = w .* particle.velocity .+
+                           c1 .* rand(typeof(particle.velocity)) .*
+                           (particle.best_position -
+                            particle.position) .+
+                           c2 .* rand(typeof(particle.velocity)) .*
+                           (gbest.position - particle.position)
 
-    updated_velocity = w .* particle.velocity .+
-                       c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
-                        particle.position) .+
-                       c2 .* rand(typeof(particle.velocity)) .*
-                       (gbest.position - particle.position)
+        @set! particle.velocity = updated_velocity
 
-    @set! particle.velocity = updated_velocity
+        @set! particle.position = particle.position + particle.velocity
 
-    @set! particle.position = particle.position + particle.velocity
+        update_pos = max(particle.position, lb)
+        update_pos = min(update_pos, ub)
 
-    update_pos = max(particle.position, lb)
-    update_pos = min(update_pos, ub)
+        @set! particle.position = update_pos
 
-    @set! particle.position = update_pos
-
-    @inbounds gpu_particles[i] = particle
-
-    return nothing
+        @inbounds gpu_particles[i] = particle
+    end
 end
 
-function _update_particle_costs!(losses, gpu_particles)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    i > length(losses) && return
+@kernel function _update_particle_costs!(losses, gpu_particles)
+    i = @index(Global, Linear)
+    if i <= length(losses)
+        @inbounds particle = gpu_particles[i]
+        @inbounds loss = losses[i]
 
-    @inbounds particle = gpu_particles[i]
-    @inbounds loss = losses[i]
+        @set! particle.cost = loss
 
-    @set! particle.cost = loss
+        if particle.cost < particle.best_cost
+            @set! particle.best_position = particle.position
+            @set! particle.best_cost = particle.cost
+        end
 
-    if particle.cost < particle.best_cost
-        @set! particle.best_position = particle.position
-        @set! particle.best_cost = particle.cost
+        @inbounds gpu_particles[i] = particle
     end
-
-    @inbounds gpu_particles[i] = particle
-
-    return nothing
 end
 
 function default_prob_func(prob, gpu_particle)
@@ -59,16 +56,11 @@ function parameter_estim_ode!(prob::ODEProblem,
         w = 0.72980f0,
         wdamp = 1.0f0,
         maxiters = 100, kwargs...)
-    update_states! = @cuda launch=false PSOGPU._update_particle_states!(gpu_particles, lb,
-        ub,
-        gbest,
-        w)
-
-    losses = CUDA.ones(1, length(gpu_particles))
-    update_costs! = @cuda launch=false PSOGPU._update_particle_costs!(losses, gpu_particles)
+    backend = get_backend(gpu_particles)
+    update_states! = PSOGPU._update_particle_states!(backend)
 
-    config_states = launch_configuration(update_states!.fun)
-    config_costs = launch_configuration(update_costs!.fun)
+    losses = KernelAbstractions.ones(backend, 1, length(gpu_particles))
+    update_costs! = PSOGPU._update_particle_costs!(backend)
 
     improb = make_prob_compatible(prob)
 
@@ -78,8 +70,7 @@ function parameter_estim_ode!(prob::ODEProblem,
             ub,
             gbest,
             w;
-            config_states.threads,
-            config_states...)
+            ndrange = length(gpu_particles))
 
         probs = prob_func.(Ref(improb), gpu_particles)
 
@@ -89,7 +80,7 @@ function parameter_estim_ode!(prob::ODEProblem,
 
         sum!(losses, (map(x -> sum(x .^ 2), data .- us)))
 
-        update_costs!(losses, gpu_particles; config_costs.threads, config_costs...)
+        update_costs!(losses, gpu_particles; ndrange = length(losses))
 
         best_particle = minimum(gpu_particles,
             init = PSOGPU.PSOParticle(gbest.position,