@@ -123,8 +123,8 @@ function Base._mapreducedim!(f, op, R::GPUArray, A::GPUSrcArray)
123
123
return R
124
124
end
125
125
126
- simple_broadcast_index (A:: AbstractArray , i) = A[i]
127
- simple_broadcast_index (x, i) = x
126
+ @inline simple_broadcast_index (A:: AbstractArray , i... ) = @inbounds A[i... ]
127
+ @inline simple_broadcast_index (x, i... ) = x
128
128
129
129
for i = 0 : 10
130
130
args = ntuple (x-> Symbol (" arg_" , x), i)
@@ -138,19 +138,19 @@ for i = 0:10
138
138
# # Loop sequentially over chunks of input vector
139
139
# HACK: length(A) and axes(A) aren't GPU compatible, so pass them instead
140
140
# https://github.com/JuliaGPU/CUDAnative.jl/issues/367
141
- while global_index <= len
141
+ @inbounds while global_index <= len
142
142
cartesian_global_index = Tuple (CartesianIndices (ax)[global_index])
143
143
@inbounds element = f (A[cartesian_global_index... ], $ (fargs... ))
144
144
acc = op (acc, element)
145
145
global_index += global_size (state)
146
146
end
147
147
# Perform parallel reduction
148
148
local_index = threadidx_x (state) - 1
149
- tmp_local[local_index + 1 ] = acc
149
+ @inbounds tmp_local[local_index + 1 ] = acc
150
150
synchronize_threads (state)
151
151
152
152
offset = blockdim_x (state) ÷ 2
153
- while offset > 0
153
+ @inbounds while offset > 0
154
154
if (local_index < offset)
155
155
other = tmp_local[local_index + offset + 1 ]
156
156
mine = tmp_local[local_index + 1 ]
@@ -160,7 +160,7 @@ for i = 0:10
160
160
offset = offset ÷ 2
161
161
end
162
162
if local_index == 0
163
- result[blockidx_x (state)] = tmp_local[1 ]
163
+ @inbounds result[blockidx_x (state)] = tmp_local[1 ]
164
164
end
165
165
return
166
166
end
0 commit comments