2727import sumpy .symbolic as sym
2828from sumpy .assignment_collection import SymbolicAssignmentCollection
2929from sumpy .tools import gather_loopy_arguments
30- from math import prod
30+ from math import prod , gcd
3131
3232import logging
3333logger = logging .getLogger (__name__ )
@@ -239,6 +239,9 @@ def l2p_loopy_volume_taylor(expansion, kernels):
239239 expression = coeffs [icoeff ],
240240 id = "copy_coeffs" ,
241241 ))
242+ # We need two rows for coeffs_copy since we cannot use inplace
243+ # updates due to parallel updates so we alternatively use
244+ # coeffs_copy[0, :] and coeffs_copy[1, :] to write and read from.
242245 temporary_variables .append (lp .TemporaryVariable (
243246 name = "coeffs_copy" ,
244247 shape = (2 , ncoeffs ),
@@ -254,16 +257,34 @@ def l2p_loopy_volume_taylor(expansion, kernels):
254257 if all (m != 0 for m in max_mi ):
255258 raise NotImplementedError ("non-elliptic PDEs" )
256259
257- c = max_mi [axis_permutation [0 ]]
260+ slowest_axis = axis_permutation [0 ]
261+ c = max_mi [slowest_axis ]
258262 v = [pymbolic .var (f"x{ i } " ) for i in range (dim )]
259- v [axis_permutation [ 0 ]] , v [0 ] = v [0 ], v [axis_permutation [ 0 ] ]
263+ v [slowest_axis ] , v [0 ] = v [0 ], v [slowest_axis ]
260264 x0 = v [0 ]
261265
262- def get_domains (v , iorder ):
266+ # sync_split is the maximum number of iterations in v[0] that we can do
267+ # before a synchronization is needed. For Laplace 2D there are two rows
268+ # of stored coeffs, and both of them can be calculated before a sync
269+ # is needed. For biharmonic 2D there are four rows in stored coeffs,
270+ # but synchronization needs to happen every two rows because calculating
271+ # the 6th row needs the 4th row synchronized
272+ sync_split = gcd (* [c - deriv_id .mi [slowest_axis ]
273+ for deriv_id in deriv_id_to_coeff ])
274+
275+ def get_domains (v , iorder , with_sync ):
263276 domains = [f"{{ [{ x0 } _outer]: 0<={ x0 } _outer<={ order // c } }}" ]
264- expr = f"{ v [0 ]} _inner + { c } *{ x0 } _outer"
265- domains += [f"{{ [{ v [0 ]} _inner]: 0<={ expr } <=order "
266- f"and 0<={ v [0 ]} _inner<{ c } }}" ]
277+ if with_sync :
278+ expr = f"{ c // sync_split } *{ x0 } _sync_outer + { c } *{ x0 } _outer"
279+ domains += [f"{{ [{ x0 } _sync_outer]: 0<={ expr } <=order "
280+ f"and 0<={ x0 } _sync_outer<{ c // sync_split } }}" ]
281+ expr += f" + { v [0 ]} _inner"
282+ domains += [f"{{ [{ v [0 ]} _inner]: 0<={ expr } <=order "
283+ f"and 0<={ v [0 ]} _inner<{ sync_split } }}" ]
284+ else :
285+ expr = f"{ v [0 ]} _inner + { c } *{ x0 } _outer"
286+ domains += [f"{{ [{ v [0 ]} _inner]: 0<={ expr } <=order "
287+ f"and 0<={ v [0 ]} _inner<{ c } }}" ]
267288 domains += [f"{{ [{ v [0 ]} ]: { expr } <={ v [0 ]} <={ expr } }}" ]
268289 domains += [f"{{ [{ iorder } ]: { v [0 ]} <={ iorder } <=order }}" ]
269290 upper_bound = f"{ iorder } -{ v [0 ]} "
@@ -280,9 +301,37 @@ def get_idx(v):
280301 idx = wrangler .get_storage_index (idx_sym )
281302 return idx
282303
283- domains += get_domains (v , iorder )
304+ domains += get_domains (v , iorder , with_sync = True )
284305 idx = get_idx (v )
285306
307+ if c == sync_split :
308+ # We do not need to sync within the c rows.
309+ # Update the values from the c rows set coeffs_copy[p, :] from
310+ # the previous c rows set coeffs_copy[p-1, :]
311+ # and then read from coeffs_copy[p, :].
312+ # This code path is different to avoid an extra copy and
313+ # a synchronization step.
314+ prev_copy_idx = (v [0 ]// c - 1 ) % 2
315+ curr_copy_idx = (v [0 ]// c ) % 2
316+ fetch_idx = (v [0 ]// c ) % 2
317+ else :
318+ # We need to sync within the c rows.
319+ # Using the biharmonic 2D example:
320+ # - Update the rows 4, 5 at coeffs_copy[1, :] from values at
321+ # coeffs_copy[0, :]
322+ # - Synchronize
323+ # - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
324+ # - Synchronize
325+ # - Update the rows 6, 7 at coeffs_copy[1, :] from values at
326+ # coeffs_copy[0, :]
327+ # - Synchronize
328+ # - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
329+ # - Synchronize
330+ # - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
331+ prev_copy_idx = 0
332+ curr_copy_idx = 1
333+ fetch_idx = 0
334+
286335 max_mi_sym = [v [i ] - max_mi [i ] for i in range (dim )]
287336 scale = - 1 / deriv_id_to_coeff [max_deriv_id ]
288337 expr = 0
@@ -291,25 +340,42 @@ def get_idx(v):
291340 continue
292341 mi_sym = [max_mi_sym [i ] + deriv_id .mi [i ] for i in range (dim )]
293342 mi_sym [0 ] = mi_sym [0 ] % c
294- expr += (coeffs_copy [( v [ 0 ] // c + 1 ) % 2 ,
343+ expr += (coeffs_copy [prev_copy_idx ,
295344 wrangler .get_storage_index (mi_sym )]
296345 * (rscale ** (sum (max_mi ) - sum (deriv_id .mi ))
297346 * pymbolic_conv (pde_coeff ) * scale ))
298347
299348 insns .append (lp .Assignment (
300- assignee = coeffs_copy [( v [ 0 ] // c ) % 2 , idx ],
349+ assignee = coeffs_copy [curr_copy_idx , idx ],
301350 expression = expr ,
302351 id = "update_coeffs" ,
303352 depends_on = frozenset (["copy_coeffs" ]),
304353 depends_on_is_final = True ,
305354 predicates = frozenset ([prim .Comparison (v [0 ], ">=" , c )]),
306355 ))
307356
357+ if c != sync_split :
358+ # We now copy before synchronization
359+ v = [pymbolic .var (f"z{ i } " ) for i in range (dim )]
360+ v [slowest_axis ], v [0 ] = v [0 ], v [slowest_axis ]
361+ iorder = pymbolic .var ("iorder3" )
362+ idx = get_idx (v )
363+ domains += get_domains (v , iorder , with_sync = True )[2 :]
364+
365+ insns .append (lp .Assignment (
366+ assignee = coeffs_copy [0 , idx ],
367+ expression = coeffs_copy [1 , idx ],
368+ id = "copy_sync" ,
369+ depends_on = frozenset (["update_coeffs" ]),
370+ depends_on_is_final = True ,
371+ predicates = frozenset ([prim .Comparison (v [0 ], ">=" , c )]),
372+ ))
373+
308374 v = [pymbolic .var (f"y{ i } " ) for i in range (dim )]
309- v [axis_permutation [ 0 ]] , v [0 ] = v [0 ], v [axis_permutation [ 0 ] ]
375+ v [slowest_axis ] , v [0 ] = v [0 ], v [slowest_axis ]
310376 iorder = pymbolic .var ("iorder2" )
311377 idx = get_idx (v )
312- domains += get_domains (v , iorder )[1 :]
378+ domains += get_domains (v , iorder , with_sync = False )[1 :]
313379
314380 for ikernel , expr_dict in enumerate (sym_expr_dicts ):
315381 expr = sum (coeff * prod (powers [i ,
@@ -320,20 +386,26 @@ def get_idx(v):
320386 insn = lp .Assignment (
321387 assignee = result [ikernel ],
322388 expression = (result [ikernel ]
323- + coeffs_copy [( v [ 0 ] // c ) % 2 , idx ] * expr ),
389+ + coeffs_copy [fetch_idx , idx ] * expr ),
324390 id = f"write_{ ikernel } " ,
325- depends_on = frozenset (["update_monomials" , "update_coeffs" ]),
391+ depends_on = frozenset (["update_monomials" ,
392+ "update_coeffs" if c == sync_split else "copy_sync" ]),
326393 depends_on_is_final = True ,
327394 )
328395 insns .append (insn )
396+
397+ tags = {
398+ "e2p_iorder1" : "l.0" ,
399+ f"e2p_{ x0 } _outer" : "unr" ,
400+ f"e2p_{ x0 } _inner" : "unr" ,
401+ f"e2p_{ v [0 ]} _inner" : "unr" ,
402+ "e2p_iorder2" : "unr" ,
403+ }
404+ if c != sync_split :
405+ tags ["e2p_iorder3" ] = "l.0"
406+
329407 optimizations += [
330- lambda knl : lp .tag_inames (knl , {
331- "e2p_iorder1" : "l.0" ,
332- f"e2p_{ x0 } _outer" : "unr" ,
333- f"e2p_{ x0 } _inner" : "unr" ,
334- f"e2p_{ v [0 ]} _inner" : "unr" ,
335- "e2p_iorder2" : "unr" ,
336- }),
408+ lambda knl : lp .tag_inames (knl , tags ),
337409 lambda knl : lp .set_temporary_address_space (knl , "e2p_coeffs_copy" ,
338410 lp .AddressSpace .LOCAL ),
339411 lambda knl : lp .split_iname (knl , "e2p_icoeff" , 32 , inner_tag = "l.0" ),
0 commit comments