@@ -225,3 +225,330 @@ def make_p2e_loopy_kernel(
225225 loopy_knl = kernel .prepare_loopy_kernel (loopy_knl )
226226
227227 return loopy_knl
228+
229+
230+ def make_l2p_loopy_kernel_for_volume_taylor (expansion , kernels ):
231+ dim = expansion .dim
232+ order = expansion .order
233+ ncoeffs = len (expansion )
234+
235+ code_transformers = [expansion .get_code_transformer ()] \
236+ + [kernel .get_code_transformer () for kernel in kernels ]
237+ pymbolic_conv = sym .SympyToPymbolicMapper ()
238+
239+ max_deriv_order = 0
240+ sym_expr_dicts = []
241+ for kernel in kernels :
242+ expr_dict = {(0 ,)* dim : 1 }
243+ expr_dict = kernel .get_derivative_coeff_dict_at_target (expr_dict )
244+ max_deriv_order = max (max_deriv_order , max (sum (mi ) for mi in expr_dict ))
245+ sym_expr_dict = {}
246+ for mi , coeff in expr_dict .items ():
247+ coeff = pymbolic_conv (coeff )
248+ for transform in code_transformers :
249+ coeff = transform (coeff )
250+ sym_expr_dict [mi ] = coeff
251+ sym_expr_dicts .append (sym_expr_dict )
252+
253+ domains = [
254+ "{[idim]: 0<=idim<dim}" ,
255+ "{[iorder0]: 0<iorder0<=order}" ,
256+ "{[zero_idx]: 0<=zero_idx<max_deriv_order}" ,
257+ "{[icoeff]: 0<=icoeff<ncoeffs}" ,
258+ ]
259+
260+ powers = pymbolic .var ("power_b" )
261+ iorder = pymbolic .var ("iorder0" )
262+ idim = pymbolic .var ("idim" )
263+ result = pymbolic .var ("result" )
264+ b = pymbolic .var ("b" )
265+ center = pymbolic .var ("center" )
266+ target = pymbolic .var ("target" )
267+ rscale = pymbolic .var ("rscale" )
268+ coeffs = pymbolic .var ("coeffs" )
269+ icoeff = pymbolic .var ("icoeff" )
270+ zero_idx = pymbolic .var ("zero_idx" )
271+ temporary_variables = []
272+
273+ insns = [
274+ lp .Assignment (
275+ assignee = b [idim ],
276+ expression = (target [idim ] - center [idim ])* (1 / rscale ),
277+ id = "set_b" ,
278+ temp_var_type = lp .Optional (None ),
279+ ),
280+ # We need negative index access in the array to be zero
281+ # However loopy does not support negative indices, and we
282+ # have an offset of max_deriv_order for array access and
283+ # the first max_deriv_order values are set to zero.
284+ lp .Assignment (
285+ assignee = powers [idim , zero_idx ],
286+ expression = 0 ,
287+ id = "zero_monomials" ,
288+ temp_var_type = lp .Optional (None ),
289+ ),
290+ lp .Assignment (
291+ assignee = powers [idim , max_deriv_order ],
292+ expression = 1 ,
293+ id = "init_monomials" ,
294+ depends_on = frozenset (["zero_monomials" ]),
295+ ),
296+ lp .Assignment (
297+ assignee = powers [idim , max_deriv_order + iorder ],
298+ expression = (
299+ powers [idim , max_deriv_order + iorder - 1 ]* b [idim ]* (1 / iorder )),
300+ id = "update_monomials" ,
301+ depends_on = frozenset (["set_b" , "init_monomials" ]),
302+ ),
303+ ]
304+
305+ optimizations = [lambda knl : lp .tag_inames (knl , "e2p_iorder0:unr" )]
306+ iorder = pymbolic .var ("iorder1" )
307+ wrangler = expansion .expansion_terms_wrangler
308+
309+ from sumpy .expansion import LinearPDEConformingVolumeTaylorExpansion
310+ if not isinstance (expansion , LinearPDEConformingVolumeTaylorExpansion ):
311+ v = [pymbolic .var (f"x{ i } " ) for i in range (dim )]
312+ domains += ["{[iorder1]: 0<=1iorder1<=order}" ]
313+ upper_bound = "iorder1"
314+ for i in range (dim - 1 , 0 , - 1 ):
315+ domains += [f"{{ [{ v [i ]} ]: 0<={ v [i ]} <={ upper_bound } }}" ]
316+ upper_bound += f"-{ v [i ]} "
317+ domains += [f"{{ [{ v [0 ]} ]: { upper_bound } <={ v [0 ]} <={ upper_bound } }}" ]
318+ idx = wrangler .get_storage_index (v , iorder )
319+
320+ for ikernel , expr_dict in enumerate (sym_expr_dicts ):
321+ expr = sum (coeff * prod (powers [i ,
322+ v [i ] + max_deriv_order - mi [i ]] for i in range (dim ))
323+ * (1 / rscale ** sum (mi ))
324+ for mi , coeff in expr_dict .items ())
325+
326+ insn = lp .Assignment (
327+ assignee = result [ikernel ],
328+ expression = (result [ikernel ]
329+ + coeffs [idx ] * expr ),
330+ id = f"write_{ ikernel } " ,
331+ depends_on = frozenset (["update_monomials" ]),
332+ )
333+ insns .append (insn )
334+ optimizations .append (lambda knl : lp .tag_inames (knl , "e2p_iorder1:unr" ))
335+ else :
336+ coeffs_copy = pymbolic .var ("coeffs_copy" )
337+ insns .append (lp .Assignment (
338+ assignee = coeffs_copy [0 , icoeff ],
339+ expression = coeffs [icoeff ],
340+ id = "copy_coeffs" ,
341+ ))
342+ # We need two rows for coeffs_copy since we cannot use inplace
343+ # updates due to parallel updates so we alternatively use
344+ # coeffs_copy[0, :] and coeffs_copy[1, :] to write and read from.
345+ temporary_variables .append (lp .TemporaryVariable (
346+ name = "coeffs_copy" ,
347+ shape = (2 , ncoeffs ),
348+ ))
349+ base_kernel = kernels [0 ].get_base_kernel ()
350+ deriv_id_to_coeff , = base_kernel .get_pde_as_diff_op ().eqs
351+
352+ ordering_key , axis_permutation = \
353+ wrangler ._get_mi_ordering_key_and_axis_permutation ()
354+ max_deriv_id = max (deriv_id_to_coeff , key = ordering_key )
355+ max_mi = max_deriv_id .mi
356+
357+ if all (m != 0 for m in max_mi ):
358+ raise NotImplementedError ("non-elliptic PDEs" )
359+
360+ slowest_axis = axis_permutation [0 ]
361+ c = max_mi [slowest_axis ]
362+ v = [pymbolic .var (f"x{ i } " ) for i in range (dim )]
363+ v [slowest_axis ], v [0 ] = v [0 ], v [slowest_axis ]
364+ x0 = v [0 ]
365+
366+ # sync_split is the maximum number of iterations in v[0] that we can do
367+ # before a synchronization is needed. For Laplace 2D there are two rows
368+ # of stored coeffs, and both of them can be calculated before a sync
369+ # is needed. For biharmonic 2D there are four rows in stored coeffs,
370+ # but synchronization needs to happen every two rows because calculating
371+ # the 6th row needs the 4th row synchronized
372+ sync_split = gcd (* [c - deriv_id .mi [slowest_axis ]
373+ for deriv_id in deriv_id_to_coeff ])
374+
375+ def get_domains (v , iorder , with_sync ):
376+ domains = [f"{{ [{ x0 } _outer]: 0<={ x0 } _outer<={ order // c } }}" ]
377+ if with_sync :
378+ expr = f"{ c // sync_split } *{ x0 } _sync_outer + { c } *{ x0 } _outer"
379+ domains += [f"{{ [{ x0 } _sync_outer]: 0<={ expr } <={ order } "
380+ f"and 0<={ x0 } _sync_outer<{ c // sync_split } }}" ]
381+ expr += f" + { v [0 ]} _inner"
382+ domains += [f"{{ [{ v [0 ]} _inner]: 0<={ expr } <={ order } "
383+ f"and 0<={ v [0 ]} _inner<{ sync_split } }}" ]
384+ else :
385+ expr = f"{ v [0 ]} _inner + { c } *{ x0 } _outer"
386+ domains += [f"{{ [{ v [0 ]} _inner]: 0<={ expr } <={ order } "
387+ f"and 0<={ v [0 ]} _inner<{ c } }}" ]
388+ domains += [f"{{ [{ v [0 ]} ]: { expr } <={ v [0 ]} <={ expr } }}" ]
389+ domains += [f"{{ [{ iorder } ]: { v [0 ]} <={ iorder } <={ order } }}" ]
390+ upper_bound = f"{ iorder } -{ v [0 ]} "
391+ for i in range (dim - 1 , 1 , - 1 ):
392+ domains += [f"{{ [{ v [i ]} ]: 0<={ v [i ]} <={ upper_bound } }}" ]
393+ upper_bound += f"-{ v [i ]} "
394+ domains += [
395+ f"{{ [{ v [1 ]} ]: { upper_bound } <={ v [1 ]} <={ upper_bound } }}" ]
396+ return domains
397+
398+ def get_idx (v ):
399+ idx_sym = list (v )
400+ idx_sym [0 ] = v [0 ] % c
401+ idx = wrangler .get_storage_index (idx_sym )
402+ return idx
403+
404+ domains += get_domains (v , iorder , with_sync = True )
405+ idx = get_idx (v )
406+
407+ if c == sync_split :
408+ # We do not need to sync within the c rows.
409+ # Update the values from the c rows set coeffs_copy[p, :] from
410+ # the previous c rows set coeffs_copy[p-1, :]
411+ # and then read from coeffs_copy[p, :].
412+ # This code path is different to avoid an extra copy and
413+ # a synchronization step.
414+ prev_copy_idx = (v [0 ]// c - 1 ) % 2
415+ curr_copy_idx = (v [0 ]// c ) % 2
416+ else :
417+ # We need to sync within the c rows.
418+ # Using the biharmonic 2D example:
419+ # - Update the rows 4, 5 at coeffs_copy[1, :] from values at
420+ # coeffs_copy[0, :]
421+ # - Synchronize
422+ # - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
423+ # - Synchronize
424+ # - Update the rows 6, 7 at coeffs_copy[1, :] from values at
425+ # coeffs_copy[0, :]
426+ # - Synchronize
427+ # - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
428+ # - Synchronize
429+ # - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
430+ prev_copy_idx = 0
431+ curr_copy_idx = 1
432+
433+ max_mi_sym = [v [i ] - max_mi [i ] for i in range (dim )]
434+ scale = - 1 / deriv_id_to_coeff [max_deriv_id ]
435+ expr = 0
436+ for deriv_id , pde_coeff in deriv_id_to_coeff .items ():
437+ if deriv_id == max_deriv_id :
438+ continue
439+ mi_sym = [max_mi_sym [i ] + deriv_id .mi [i ] for i in range (dim )]
440+ mi_sym [0 ] = mi_sym [0 ] % c
441+ expr += (coeffs_copy [prev_copy_idx ,
442+ wrangler .get_storage_index (mi_sym )]
443+ * (rscale ** (sum (max_mi ) - sum (deriv_id .mi ))
444+ * pymbolic_conv (pde_coeff ) * scale ))
445+
446+ insns .append (lp .Assignment (
447+ assignee = coeffs_copy [curr_copy_idx , idx ],
448+ expression = expr ,
449+ id = "update_coeffs" ,
450+ depends_on = frozenset (["copy_coeffs" ]),
451+ depends_on_is_final = True ,
452+ predicates = frozenset ([prim .Comparison (v [0 ], ">=" , c )]),
453+ ))
454+
455+ if c != sync_split :
456+ # We now copy before synchronization
457+ v = [pymbolic .var (f"z{ i } " ) for i in range (dim )]
458+ v [slowest_axis ], v [0 ] = v [0 ], v [slowest_axis ]
459+ iorder = pymbolic .var ("iorder3" )
460+ idx = get_idx (v )
461+ domains += get_domains (v , iorder , with_sync = True )[2 :]
462+
463+ insns .append (lp .Assignment (
464+ assignee = coeffs_copy [0 , idx ],
465+ expression = coeffs_copy [1 , idx ],
466+ id = "copy_sync" ,
467+ depends_on = frozenset (["update_coeffs" ]),
468+ depends_on_is_final = True ,
469+ predicates = frozenset ([prim .Comparison (v [0 ], ">=" , c )]),
470+ ))
471+
472+ v = [pymbolic .var (f"y{ i } " ) for i in range (dim )]
473+ v [slowest_axis ], v [0 ] = v [0 ], v [slowest_axis ]
474+ iorder = pymbolic .var ("iorder2" )
475+ idx = get_idx (v )
476+ domains += get_domains (v , iorder , with_sync = False )[1 :]
477+
478+ if c == sync_split :
479+ # We did not have to sync within the c rows.
480+ # We last wrote to coeffs_copy[v[0]//c % 2, :] and we read from it.
481+ fetch_idx = (v [0 ]// c ) % 2
482+ else :
483+ # We need to sync within the c rows.
484+ # We last wrote to coeffs_copy[0, :] and we read from it.
485+ fetch_idx = 0
486+
487+ for ikernel , expr_dict in enumerate (sym_expr_dicts ):
488+ expr = sum (coeff * prod (powers [i ,
489+ v [i ] + max_deriv_order - mi [i ]] for i in range (dim ))
490+ * (1 / rscale ** sum (mi ))
491+ for mi , coeff in expr_dict .items ())
492+
493+ insn = lp .Assignment (
494+ assignee = result [ikernel ],
495+ expression = (result [ikernel ]
496+ + coeffs_copy [fetch_idx , idx ] * expr ),
497+ id = f"write_{ ikernel } " ,
498+ depends_on = frozenset (["update_monomials" ,
499+ "update_coeffs" if c == sync_split else "copy_sync" ]),
500+ depends_on_is_final = True ,
501+ )
502+ insns .append (insn )
503+
504+ tags = {
505+ "e2p_iorder1" : "l.0" ,
506+ f"e2p_{ x0 } _outer" : "unr" ,
507+ f"e2p_{ x0 } _inner" : "unr" ,
508+ f"e2p_{ v [0 ]} _inner" : "unr" ,
509+ "e2p_iorder2" : "unr" ,
510+ }
511+ if c != sync_split :
512+ tags ["e2p_iorder3" ] = "l.0"
513+
514+ optimizations += [
515+ lambda knl : lp .tag_inames (knl , tags ),
516+ lambda knl : lp .set_temporary_address_space (knl , "e2p_coeffs_copy" ,
517+ lp .AddressSpace .LOCAL ),
518+ lambda knl : lp .split_iname (knl , "e2p_icoeff" , 32 , inner_tag = "l.0" ),
519+ ]
520+
521+ target_args = gather_loopy_arguments ((expansion ,) + tuple (kernels ))
522+ loopy_knl = lp .make_function (domains , insns ,
523+ kernel_data = [
524+ lp .GlobalArg ("result" , shape = (len (kernels ),), is_input = True ,
525+ is_output = True ),
526+ lp .GlobalArg ("coeffs" ,
527+ shape = (ncoeffs ,), is_input = True , is_output = False ),
528+ lp .GlobalArg ("center, target" ,
529+ shape = (dim ,), is_input = True , is_output = False ),
530+ lp .ValueArg ("rscale" , is_input = True ),
531+ lp .ValueArg ("itgt" , is_input = True ),
532+ lp .ValueArg ("ntargets" , is_input = True ),
533+ lp .GlobalArg ("targets" ,
534+ shape = (dim , "ntargets" ), is_input = True , is_output = False ),
535+ * target_args ,
536+ * temporary_variables ,
537+ ...],
538+ name = "e2p" ,
539+ lang_version = lp .MOST_RECENT_LANGUAGE_VERSION ,
540+ fixed_parameters = {
541+ "dim" : dim ,
542+ "nresults" : len (kernels ),
543+ "order" : order ,
544+ "max_deriv_order" : max_deriv_order ,
545+ "ncoeffs" : ncoeffs ,
546+ },
547+ )
548+
549+ loopy_knl = lp .tag_inames (loopy_knl , "idim*:unr" )
550+
551+ for kernel in kernels :
552+ loopy_knl = kernel .prepare_loopy_kernel (loopy_knl )
553+
554+ return loopy_knl , optimizations
0 commit comments