You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
if (length(optvars)==0) @IncoherentArgumentError("incoherent argument memopt in @parallel[_indices] <kernel>: optimization can only be applied if there is at least one array that is read-only within the kernel (and accessed with a multi-point stencil). Set memopt=false for this kernel.") end
@@ -125,7 +126,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul
125
126
126
127
#TODO: replace wrap_if where possible with in-line if - compare performance when doing it
127
128
body =quote
128
-
$loopoffset = (@blockIdx().z-1)*$loopsize #TODO: MOVE UP - see no perf change! interchange other lines!
129
+
$loopoffset = (@blockIdx().z-1)*$loopsize +$range_z_start-1#TODO: MOVE UP - see no perf change! interchange other lines!
129
130
$((quote
130
131
$tx =@threadIdx().x +$hx1
131
132
$ty =@threadIdx().y +$hy1
@@ -164,9 +165,12 @@ $((:( $reg = 0.0
164
165
# for $i = $loopstart:$(mainloopstart-1)
165
166
$(wrap_loop(i, loopstart:mainloopstart-1,
166
167
quote
167
-
$tz_g =$i +$loopoffset
168
-
if ($tz_g >$rangelength_z) ParallelStencil.@return_nothing; end
169
-
$iz = ($tz_g <1) ?$range_z_start-(1-$tz_g) :$range_z #TODO: this will probably always be formulated with range_z_start
168
+
$iz =$i +$loopoffset
169
+
if ($iz >$range_z_end) ParallelStencil.@return_nothing; end
170
+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
171
+
# $tz_g = $i + $loopoffset
172
+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
173
+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
@@ -212,9 +216,12 @@ $(( # NOTE: the if statement is not needed here as we only deal with registers
212
216
# for $i = $mainloopstart:$mainloopend # ParallelStencil.@unroll
213
217
$(wrap_loop(i, mainloopstart:mainloopend,
214
218
quote
215
-
$tz_g =$i +$loopoffset
216
-
if ($tz_g >$rangelength_z) ParallelStencil.@return_nothing; end
217
-
$iz = ($tz_g <1) ?$range_z_start-(1-$tz_g) :$range_z #TODO: this will probably always be formulated with range_z_start
219
+
$iz =$i +$loopoffset
220
+
if ($iz >$range_z_end) ParallelStencil.@return_nothing; end
221
+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
222
+
# $tz_g = $i + $loopoffset
223
+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
224
+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
@@ -545,7 +552,8 @@ function remove_single_point_optvars(optvars, optranges_arg, offsets, offsets_by
545
552
returntuple((A for A in optvars if!(length(keys(offsets[A]))==1&&length(keys(offsets_by_z[A]))==1) || (!isnothing(optranges_arg) && A ∈keys(optranges_arg)))...)
0 commit comments