Skip to content

Commit 1e5ce2d

Browse files
authored
Merge pull request #28 from SciML/jps/ka
Switch to KernelAbstractions
2 parents 7bbd997 + 55bd9cc commit 1e5ce2d

File tree

13 files changed, with 242 additions (+242) and 242 deletions (-242).

.buildkite/runtests.yml

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,34 @@ steps:
2626
# Don't run Buildkite if the commit message includes the text [skip tests]
2727
if: build.message !~ /\[skip tests\]/
2828

29+
- label: ":julia: [AMDGPU] Run tests on Julia v{{matrix.version}}"
30+
matrix:
31+
setup:
32+
version:
33+
- "1"
34+
env:
35+
GROUP: AMDGPU
36+
plugins:
37+
- JuliaCI/julia#v1:
38+
version: "{{matrix.version}}"
39+
commands:
40+
- |
41+
julia --project=test -e '
42+
import Pkg
43+
Pkg.add(; name = "AMDGPU")'
44+
rm test/Manifest.toml
45+
julia --project -e'
46+
import Pkg
47+
println("+++ :julia: Running tests")
48+
Pkg.test(; coverage=false)'
49+
agents:
50+
queue: "juliagpu"
51+
rocm: "*"
52+
rocmgpu: "*"
53+
timeout_in_minutes: 120
54+
# Don't run Buildkite if the commit message includes the text [skip tests]
55+
if: build.message !~ /\[skip tests\]/
56+
2957
env:
3058
JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
31-
# SECRET_CODECOV_TOKEN: "..."
59+
# SECRET_CODECOV_TOKEN: "..."

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ authors = ["Utkarsh <[email protected]> and contributors"]
44
version = "1.0.0-DEV"
55

66
[deps]
7-
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
87
DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
8+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
99
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
1010
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1111
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"

examples/neural_network/nn.jl

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ using SimpleChains
22
using IterTools
33
using MLDatasets
44
using Random
5-
dataset = MLDatasets.Iris().dataframe
5+
dataset = MLDatasets.Iris().dataframe
66

77
data = Array(dataset)
88
data = data[shuffle(1:end), :]
@@ -17,11 +17,9 @@ function mapstrtoclass(flower)
1717
end
1818
end
1919
ytrain = map(mapstrtoclass, data[:, 5])
20-
lenet = SimpleChain(
21-
static(4),
20+
lenet = SimpleChain(static(4),
2221
TurboDense{true}(tanh, 20),
23-
TurboDense{true}(identity, 3),
24-
)
22+
TurboDense{true}(identity, 3))
2523
lenetloss = SimpleChains.add_loss(lenet, LogitCrossEntropyLoss(ytrain))
2624

2725
p = SimpleChains.init_params(lenet);
@@ -30,18 +28,16 @@ G = SimpleChains.alloc_threaded_grad(lenet);
3028

3129
lenetloss(xtrain, p)
3230

33-
report = let mlpdloss = lenetloss, X=xtrain
34-
p -> begin
35-
let train = mlpdloss(X, p)
36-
@info "Loss:" train
37-
end
31+
report = let mlpdloss = lenetloss, X = xtrain
32+
p -> begin
33+
let train = mlpdloss(X, p)
34+
@info "Loss:" train
35+
end
3836
end
3937
end
4038

4139
for _ in 1:3
42-
@time SimpleChains.train_unbatched!(
43-
G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000
44-
);
40+
@time SimpleChains.train_unbatched!(G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000)
4541
report(p)
4642
end
4743

@@ -53,10 +49,10 @@ using Optimization, PSOGPU
5349

5450
lb = -ones(length(p)) .* 10
5551
ub = ones(length(p)) .* 10
56-
prob = OptimizationProblem((u,data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
52+
prob = OptimizationProblem((u, data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
5753

5854
n_particles = 1000
5955

6056
sol = solve(prob,
61-
ParallelPSOKernel(n_particles; gpu = false, threaded = true),
62-
maxiters = 1000)
57+
ParallelPSOKernel(n_particles; threaded = true),
58+
maxiters = 1000)

examples/ode_estimation/Lotka_Volterra/lotka_volterra.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using StaticArrays, SciMLBase
1+
using StaticArrays, SciMLBase, OrdinaryDiffEq
22

33
function f(u, p, t)
44
dx = p[1] * u[1] - p[2] * u[1] * u[2]
@@ -46,4 +46,4 @@ ub = SVector{length(optprob.u0), eltype(optprob.u0)}(fill(eltype(optprob.u0)(Inf
4646
gbest,
4747
gpu_data,
4848
lb,
49-
ub; saveat = t, dt = 0.1)
49+
ub; saveat = t, dt = 0.1, backend = CUDABackend())

src/PSOGPU.jl

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module PSOGPU
22

3-
using SciMLBase, StaticArrays, Setfield, CUDA
3+
using SciMLBase, StaticArrays, Setfield, KernelAbstractions
44

55
import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
66

@@ -19,29 +19,26 @@ struct PSOGBest{T1, T2 <: eltype(T1)}
1919
cost::T2
2020
end
2121

22-
struct ParallelPSOKernel
22+
struct ParallelPSOKernel{Backend}
2323
num_particles::Int
2424
async::Bool
25-
gpu::Bool
2625
threaded::Bool
26+
backend::Backend
2727
end
28-
struct ParallelSyncPSO
28+
struct ParallelSyncPSO{Backend}
2929
num_particles::Int
30+
backend::Backend
3031
end
3132

3233
function ParallelPSOKernel(num_particles::Int;
33-
async = false,
34-
gpu = false, threaded = false)
35-
ParallelPSOKernel(num_particles, async, gpu, threaded)
34+
async = false, threaded = false, backend = CPU())
35+
ParallelPSOKernel(num_particles, async, threaded, backend)
3636
end
3737

3838
SciMLBase.allowsbounds(::ParallelPSOKernel) = true
3939
SciMLBase.allowsbounds(::ParallelSyncPSO) = true
4040
# SciMLBase.requiresbounds(::ParallelPSOKernel) = true
4141

42-
struct GPU end
43-
struct CPU end
44-
4542
include("./pso_cpu.jl")
4643
include("./pso_gpu.jl")
4744
include("./pso_async_gpu.jl")
@@ -58,24 +55,29 @@ function SciMLBase.__solve(prob::OptimizationProblem,
5855

5956
prob = remake(prob; lb = lb, ub = ub)
6057

61-
if !(opt.gpu)
58+
## TODO: Compare the performance of the KernelAbstractions (KA) kernels running on the CPU backend against the dedicated CPU implementations
59+
if opt.backend isa CPU
6260
if opt.threaded
6361
gbest = PSO(prob; population = opt.num_particles, kwargs...)
6462
else
6563
init_gbest, particles = init_particles(prob, opt.num_particles)
6664
gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
6765
end
6866
else
67+
backend = opt.backend
68+
init_gbest, particles = init_particles(prob, opt.num_particles)
69+
# TODO: Do the equivalent of cu()/roc()
70+
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
71+
gpu_particles = KernelAbstractions.allocate(backend,
72+
particles_eltype,
73+
size(particles))
74+
copyto!(gpu_particles, particles)
75+
gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
76+
copyto!(gpu_init_gbest, [init_gbest])
6977
if opt.async
70-
init_gbest, particles = init_particles(prob, opt.num_particles)
71-
gpu_particles = cu(particles)
72-
init_gbest = cu([init_gbest])
73-
gbest = pso_solve_async_gpu!(prob, init_gbest, gpu_particles; kwargs...)
78+
gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
7479
else
75-
init_gbest, particles = init_particles(prob, opt.num_particles)
76-
gpu_particles = cu(particles)
77-
init_gbest = cu([init_gbest])
78-
gbest = pso_solve_gpu!(prob, init_gbest, gpu_particles; kwargs...)
80+
gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
7981
end
8082
end
8183

@@ -91,9 +93,11 @@ function SciMLBase.__solve(prob::OptimizationProblem,
9193
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
9294

9395
prob = remake(prob; lb = lb, ub = ub)
94-
96+
backend = opt.backend
9597
init_gbest, particles = init_particles(prob, opt.num_particles)
96-
gpu_particles = cu(particles)
98+
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
99+
gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
100+
copyto!(gpu_particles, particles)
97101
init_gbest = init_gbest
98102
gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)
99103

src/ode_pso.jl

Lines changed: 34 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,44 @@
1-
function _update_particle_states!(gpu_particles, lb, ub, gbest, w; c1 = 1.4962f0,
1+
@kernel function _update_particle_states!(gpu_particles, lb, ub, gbest, w; c1 = 1.4962f0,
22
c2 = 1.4962f0)
3-
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
4-
i > length(gpu_particles) && return
3+
i = @index(Global, Linear)
4+
if i <= length(gpu_particles)
5+
@inbounds particle = gpu_particles[i]
56

6-
@inbounds particle = gpu_particles[i]
7+
updated_velocity = w .* particle.velocity .+
8+
c1 .* rand(typeof(particle.velocity)) .*
9+
(particle.best_position -
10+
particle.position) .+
11+
c2 .* rand(typeof(particle.velocity)) .*
12+
(gbest.position - particle.position)
713

8-
updated_velocity = w .* particle.velocity .+
9-
c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
10-
particle.position) .+
11-
c2 .* rand(typeof(particle.velocity)) .*
12-
(gbest.position - particle.position)
14+
@set! particle.velocity = updated_velocity
1315

14-
@set! particle.velocity = updated_velocity
16+
@set! particle.position = particle.position + particle.velocity
1517

16-
@set! particle.position = particle.position + particle.velocity
18+
update_pos = max(particle.position, lb)
19+
update_pos = min(update_pos, ub)
1720

18-
update_pos = max(particle.position, lb)
19-
update_pos = min(update_pos, ub)
21+
@set! particle.position = update_pos
2022

21-
@set! particle.position = update_pos
22-
23-
@inbounds gpu_particles[i] = particle
24-
25-
return nothing
23+
@inbounds gpu_particles[i] = particle
24+
end
2625
end
2726

28-
function _update_particle_costs!(losses, gpu_particles)
29-
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
30-
i > length(losses) && return
27+
@kernel function _update_particle_costs!(losses, gpu_particles)
28+
i = @index(Global, Linear)
29+
if i <= length(losses)
30+
@inbounds particle = gpu_particles[i]
31+
@inbounds loss = losses[i]
3132

32-
@inbounds particle = gpu_particles[i]
33-
@inbounds loss = losses[i]
33+
@set! particle.cost = loss
3434

35-
@set! particle.cost = loss
35+
if particle.cost < particle.best_cost
36+
@set! particle.best_position = particle.position
37+
@set! particle.best_cost = particle.cost
38+
end
3639

37-
if particle.cost < particle.best_cost
38-
@set! particle.best_position = particle.position
39-
@set! particle.best_cost = particle.cost
40+
@inbounds gpu_particles[i] = particle
4041
end
41-
42-
@inbounds gpu_particles[i] = particle
43-
44-
return nothing
4542
end
4643

4744
function default_prob_func(prob, gpu_particle)
@@ -59,16 +56,11 @@ function parameter_estim_ode!(prob::ODEProblem,
5956
w = 0.72980f0,
6057
wdamp = 1.0f0,
6158
maxiters = 100, kwargs...)
62-
update_states! = @cuda launch=false PSOGPU._update_particle_states!(gpu_particles, lb,
63-
ub,
64-
gbest,
65-
w)
66-
67-
losses = CUDA.ones(1, length(gpu_particles))
68-
update_costs! = @cuda launch=false PSOGPU._update_particle_costs!(losses, gpu_particles)
59+
backend = get_backend(gpu_particles)
60+
update_states! = PSOGPU._update_particle_states!(backend)
6961

70-
config_states = launch_configuration(update_states!.fun)
71-
config_costs = launch_configuration(update_costs!.fun)
62+
losses = KernelAbstractions.ones(backend, 1, length(gpu_particles))
63+
update_costs! = PSOGPU._update_particle_costs!(backend)
7264

7365
improb = make_prob_compatible(prob)
7466

@@ -78,8 +70,7 @@ function parameter_estim_ode!(prob::ODEProblem,
7870
ub,
7971
gbest,
8072
w;
81-
config_states.threads,
82-
config_states...)
73+
ndrange = length(gpu_particles))
8374

8475
probs = prob_func.(Ref(improb), gpu_particles)
8576

@@ -89,7 +80,7 @@ function parameter_estim_ode!(prob::ODEProblem,
8980

9081
sum!(losses, (map(x -> sum(x .^ 2), data .- us)))
9182

92-
update_costs!(losses, gpu_particles; config_costs.threads, config_costs...)
83+
update_costs!(losses, gpu_particles; ndrange = length(losses))
9384

9485
best_particle = minimum(gpu_particles,
9586
init = PSOGPU.PSOParticle(gbest.position,

0 commit comments

Comments
 (0)