|
38 | 38 | opt::ParallelPSOKernel, lock; c1 = 1.4962f0, |
39 | 39 | c2 = 1.4962f0) where {T1, T2} |
40 | 40 | i = @index(Global, Linear) |
41 | | - # FIXME: Determine the right amount of shmem to use |
| 41 | + tidx = @index(Local, Linear) |
42 | 42 |
|
43 | 43 | @uniform gs = @groupsize()[1] |
44 | 44 |
|
45 | 45 | best_queue = @localmem SPSOGBest{T1, T2} (gs) |
46 | 46 | queue_num = @localmem UInt32 1 |
47 | 47 |
|
48 | | - @inbounds gbest = gbest_ref[1] |
49 | | - @inbounds particle = gpu_particles[i] |
| 48 | + particle = @private SPSOParticle{T1, T2} 1 |
50 | 49 |
|
51 | 50 | # Initialize cost to be Inf |
52 | | - for bq_idx in 1:gs |
53 | | - best_queue[bq_idx] = SPSOGBest(particle.best_position, |
54 | | - convert(typeof(particle.cost), Inf)) |
55 | | - end |
| 51 | + @inbounds particle[1] = gpu_particles[i] |
| 52 | + best_queue[tidx] = SPSOGBest(particle[1].best_position, |
| 53 | + convert(T2, Inf)) |
| 54 | + queue_num[1] = UInt32(0) |
56 | 55 |
|
57 | 56 | @synchronize |
58 | 57 |
|
59 | | - particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt) |
60 | | - @inbounds gpu_particles[i] = particle |
| 58 | + @inbounds particle[1] = gpu_particles[i] |
| 59 | + gbest = @inbounds gbest_ref[1] |
| 60 | + @inbounds particle[1] = update_particle_state(particle[1], prob, gbest, w, c1, c2, i, opt) |
61 | 61 |
|
62 | 62 | @synchronize |
63 | 63 |
|
64 | | - if particle.best_cost < gbest.cost |
| 64 | + @inbounds particle[1] = gpu_particles[i] |
| 65 | + gbest = @inbounds gbest_ref[1] |
| 66 | + if particle[1].best_cost < gbest.cost |
65 | 67 | queue_idx = @atomic queue_num[1] += UInt32(1) |
66 | | - @inbounds best_queue[queue_idx] = SPSOGBest(particle.best_position, |
67 | | - particle.best_cost) |
| 68 | + @inbounds best_queue[queue_idx] = SPSOGBest(particle[1].best_position, |
| 69 | + particle[1].best_cost) |
68 | 70 | end |
| 71 | + |
69 | 72 | @synchronize |
70 | | - if i <= first(@ndrange()) |
71 | | - tidx = @index(Local, Linear) |
72 | | - if tidx == 1 |
73 | | - if queue_num[1] > 1 |
74 | | - # Find best fit in block |
75 | | - for j in 2:queue_num[1] |
76 | | - @inbounds if best_queue[j].cost < best_queue[1].cost |
77 | | - best_queue[1] = best_queue[j] |
78 | | - end |
79 | | - end |
80 | 73 |
|
81 | | - # Take lock |
82 | | - while true |
83 | | - res = @atomicreplace lock[1] UInt32(0)=>UInt32(1) |
84 | | - if res.success |
85 | | - break |
86 | | - end |
| 74 | + if tidx == 1 |
| 75 | + if queue_num[1] > 1 |
| 76 | + # Find best fit in block |
| 77 | + for j in 2:queue_num[1] |
| 78 | + @inbounds if best_queue[j].cost < best_queue[1].cost |
| 79 | + best_queue[1] = best_queue[j] |
87 | 80 | end |
| 81 | + end |
88 | 82 |
|
89 | | - # Update global best fit |
90 | | - gbest = @inbounds gbest_ref[1] |
91 | | - @inbounds if best_queue[1].cost < gbest.cost |
92 | | - gbest_ref[1] = best_queue[1] |
| 83 | + # Take lock |
| 84 | + while true |
| 85 | + res = @atomicreplace lock[1] UInt32(0)=>UInt32(1) |
| 86 | + if res.success |
| 87 | + break |
93 | 88 | end |
| 89 | + end |
94 | 90 |
|
95 | | - # Release lock |
96 | | - @atomicswap lock[1] = 0 |
| 91 | + # Update global best fit |
| 92 | + gbest = @inbounds gbest_ref[1] |
| 93 | + @inbounds if best_queue[1].cost < gbest.cost |
| 94 | + gbest_ref[1] = best_queue[1] |
97 | 95 | end |
| 96 | + |
| 97 | + # Release lock |
| 98 | + @atomicswap lock[1] = 0 |
98 | 99 | end |
99 | 100 | end |
100 | 101 | end |
|
0 commit comments