Skip to content

Commit 66e1fa4

Browse files
committed
Optimize L2P for GPUs
1 parent 574f89f commit 66e1fa4

File tree

6 files changed

+341
-0
lines changed

6 files changed

+341
-0
lines changed

sumpy/expansion/local.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,14 @@ def loopy_translate_from(self, src_expansion):
405405
f"A direct loopy kernel for translation from "
406406
f"{src_expansion} to {self} is not implemented.")
407407

408+
def loopy_evaluate(self, kernels):
409+
from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor,
410+
make_l2p_loopy_kernel)
411+
try:
412+
return make_l2p_loopy_kernel_for_volume_taylor(self, kernels)
413+
except NotImplementedError:
414+
return make_e2p_loopy_kernel(self, kernels)
415+
408416

409417
class VolumeTaylorLocalExpansion(
410418
VolumeTaylorExpansion,

sumpy/expansion/loopy.py

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,3 +225,330 @@ def make_p2e_loopy_kernel(
225225
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
226226

227227
return loopy_knl
228+
229+
230+
def make_l2p_loopy_kernel_for_volume_taylor(expansion, kernels):
231+
dim = expansion.dim
232+
order = expansion.order
233+
ncoeffs = len(expansion)
234+
235+
code_transformers = [expansion.get_code_transformer()] \
236+
+ [kernel.get_code_transformer() for kernel in kernels]
237+
pymbolic_conv = sym.SympyToPymbolicMapper()
238+
239+
max_deriv_order = 0
240+
sym_expr_dicts = []
241+
for kernel in kernels:
242+
expr_dict = {(0,)*dim: 1}
243+
expr_dict = kernel.get_derivative_coeff_dict_at_target(expr_dict)
244+
max_deriv_order = max(max_deriv_order, max(sum(mi) for mi in expr_dict))
245+
sym_expr_dict = {}
246+
for mi, coeff in expr_dict.items():
247+
coeff = pymbolic_conv(coeff)
248+
for transform in code_transformers:
249+
coeff = transform(coeff)
250+
sym_expr_dict[mi] = coeff
251+
sym_expr_dicts.append(sym_expr_dict)
252+
253+
domains = [
254+
"{[idim]: 0<=idim<dim}",
255+
"{[iorder0]: 0<iorder0<=order}",
256+
"{[zero_idx]: 0<=zero_idx<max_deriv_order}",
257+
"{[icoeff]: 0<=icoeff<ncoeffs}",
258+
]
259+
260+
powers = pymbolic.var("power_b")
261+
iorder = pymbolic.var("iorder0")
262+
idim = pymbolic.var("idim")
263+
result = pymbolic.var("result")
264+
b = pymbolic.var("b")
265+
center = pymbolic.var("center")
266+
target = pymbolic.var("target")
267+
rscale = pymbolic.var("rscale")
268+
coeffs = pymbolic.var("coeffs")
269+
icoeff = pymbolic.var("icoeff")
270+
zero_idx = pymbolic.var("zero_idx")
271+
temporary_variables = []
272+
273+
insns = [
274+
lp.Assignment(
275+
assignee=b[idim],
276+
expression=(target[idim] - center[idim])*(1/rscale),
277+
id="set_b",
278+
temp_var_type=lp.Optional(None),
279+
),
280+
# We need negative index access in the array to be zero
281+
# However loopy does not support negative indices, and we
282+
# have an offset of max_deriv_order for array access and
283+
# the first max_deriv_order values are set to zero.
284+
lp.Assignment(
285+
assignee=powers[idim, zero_idx],
286+
expression=0,
287+
id="zero_monomials",
288+
temp_var_type=lp.Optional(None),
289+
),
290+
lp.Assignment(
291+
assignee=powers[idim, max_deriv_order],
292+
expression=1,
293+
id="init_monomials",
294+
depends_on=frozenset(["zero_monomials"]),
295+
),
296+
lp.Assignment(
297+
assignee=powers[idim, max_deriv_order + iorder],
298+
expression=(
299+
powers[idim, max_deriv_order + iorder - 1]*b[idim]*(1/iorder)),
300+
id="update_monomials",
301+
depends_on=frozenset(["set_b", "init_monomials"]),
302+
),
303+
]
304+
305+
optimizations = [lambda knl: lp.tag_inames(knl, "e2p_iorder0:unr")]
306+
iorder = pymbolic.var("iorder1")
307+
wrangler = expansion.expansion_terms_wrangler
308+
309+
from sumpy.expansion import LinearPDEConformingVolumeTaylorExpansion
310+
if not isinstance(expansion, LinearPDEConformingVolumeTaylorExpansion):
311+
v = [pymbolic.var(f"x{i}") for i in range(dim)]
312+
domains += ["{[iorder1]: 0<=1iorder1<=order}"]
313+
upper_bound = "iorder1"
314+
for i in range(dim - 1, 0, -1):
315+
domains += [f"{{ [{v[i]}]: 0<={v[i]}<={upper_bound} }}"]
316+
upper_bound += f"-{v[i]}"
317+
domains += [f"{{ [{v[0]}]: {upper_bound}<={v[0]}<={upper_bound} }}"]
318+
idx = wrangler.get_storage_index(v, iorder)
319+
320+
for ikernel, expr_dict in enumerate(sym_expr_dicts):
321+
expr = sum(coeff * prod(powers[i,
322+
v[i] + max_deriv_order - mi[i]] for i in range(dim))
323+
* (1 / rscale ** sum(mi))
324+
for mi, coeff in expr_dict.items())
325+
326+
insn = lp.Assignment(
327+
assignee=result[ikernel],
328+
expression=(result[ikernel]
329+
+ coeffs[idx] * expr),
330+
id=f"write_{ikernel}",
331+
depends_on=frozenset(["update_monomials"]),
332+
)
333+
insns.append(insn)
334+
optimizations.append(lambda knl: lp.tag_inames(knl, "e2p_iorder1:unr"))
335+
else:
336+
coeffs_copy = pymbolic.var("coeffs_copy")
337+
insns.append(lp.Assignment(
338+
assignee=coeffs_copy[0, icoeff],
339+
expression=coeffs[icoeff],
340+
id="copy_coeffs",
341+
))
342+
# We need two rows for coeffs_copy since we cannot use inplace
343+
# updates due to parallel updates so we alternatively use
344+
# coeffs_copy[0, :] and coeffs_copy[1, :] to write and read from.
345+
temporary_variables.append(lp.TemporaryVariable(
346+
name="coeffs_copy",
347+
shape=(2, ncoeffs),
348+
))
349+
base_kernel = kernels[0].get_base_kernel()
350+
deriv_id_to_coeff, = base_kernel.get_pde_as_diff_op().eqs
351+
352+
ordering_key, axis_permutation = \
353+
wrangler._get_mi_ordering_key_and_axis_permutation()
354+
max_deriv_id = max(deriv_id_to_coeff, key=ordering_key)
355+
max_mi = max_deriv_id.mi
356+
357+
if all(m != 0 for m in max_mi):
358+
raise NotImplementedError("non-elliptic PDEs")
359+
360+
slowest_axis = axis_permutation[0]
361+
c = max_mi[slowest_axis]
362+
v = [pymbolic.var(f"x{i}") for i in range(dim)]
363+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
364+
x0 = v[0]
365+
366+
# sync_split is the maximum number of iterations in v[0] that we can do
367+
# before a synchronization is needed. For Laplace 2D there are two rows
368+
# of stored coeffs, and both of them can be calculated before a sync
369+
# is needed. For biharmonic 2D there are four rows in stored coeffs,
370+
# but synchronization needs to happen every two rows because calculating
371+
# the 6th row needs the 4th row synchronized
372+
sync_split = gcd(*[c - deriv_id.mi[slowest_axis]
373+
for deriv_id in deriv_id_to_coeff])
374+
375+
def get_domains(v, iorder, with_sync):
376+
domains = [f"{{ [{x0}_outer]: 0<={x0}_outer<={order//c} }}"]
377+
if with_sync:
378+
expr = f"{c//sync_split}*{x0}_sync_outer + {c}*{x0}_outer"
379+
domains += [f"{{ [{x0}_sync_outer]: 0<={expr}<={order} "
380+
f"and 0<={x0}_sync_outer<{c//sync_split} }}"]
381+
expr += f" + {v[0]}_inner"
382+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<={order} "
383+
f"and 0<={v[0]}_inner<{sync_split} }}"]
384+
else:
385+
expr = f"{v[0]}_inner + {c}*{x0}_outer"
386+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<={order} "
387+
f"and 0<={v[0]}_inner<{c} }}"]
388+
domains += [f"{{ [{v[0]}]: {expr}<={v[0]}<={expr} }}"]
389+
domains += [f"{{ [{iorder}]: {v[0]}<={iorder}<={order} }}"]
390+
upper_bound = f"{iorder}-{v[0]}"
391+
for i in range(dim - 1, 1, -1):
392+
domains += [f"{{ [{v[i]}]: 0<={v[i]}<={upper_bound} }}"]
393+
upper_bound += f"-{v[i]}"
394+
domains += [
395+
f"{{ [{v[1]}]: {upper_bound}<={v[1]}<={upper_bound} }}"]
396+
return domains
397+
398+
def get_idx(v):
399+
idx_sym = list(v)
400+
idx_sym[0] = v[0] % c
401+
idx = wrangler.get_storage_index(idx_sym)
402+
return idx
403+
404+
domains += get_domains(v, iorder, with_sync=True)
405+
idx = get_idx(v)
406+
407+
if c == sync_split:
408+
# We do not need to sync within the c rows.
409+
# Update the values from the c rows set coeffs_copy[p, :] from
410+
# the previous c rows set coeffs_copy[p-1, :]
411+
# and then read from coeffs_copy[p, :].
412+
# This code path is different to avoid an extra copy and
413+
# a synchronization step.
414+
prev_copy_idx = (v[0]//c - 1) % 2
415+
curr_copy_idx = (v[0]//c) % 2
416+
else:
417+
# We need to sync within the c rows.
418+
# Using the biharmonic 2D example:
419+
# - Update the rows 4, 5 at coeffs_copy[1, :] from values at
420+
# coeffs_copy[0, :]
421+
# - Synchronize
422+
# - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
423+
# - Synchronize
424+
# - Update the rows 6, 7 at coeffs_copy[1, :] from values at
425+
# coeffs_copy[0, :]
426+
# - Synchronize
427+
# - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
428+
# - Synchronize
429+
# - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
430+
prev_copy_idx = 0
431+
curr_copy_idx = 1
432+
433+
max_mi_sym = [v[i] - max_mi[i] for i in range(dim)]
434+
scale = -1/deriv_id_to_coeff[max_deriv_id]
435+
expr = 0
436+
for deriv_id, pde_coeff in deriv_id_to_coeff.items():
437+
if deriv_id == max_deriv_id:
438+
continue
439+
mi_sym = [max_mi_sym[i] + deriv_id.mi[i] for i in range(dim)]
440+
mi_sym[0] = mi_sym[0] % c
441+
expr += (coeffs_copy[prev_copy_idx,
442+
wrangler.get_storage_index(mi_sym)]
443+
* (rscale**(sum(max_mi) - sum(deriv_id.mi))
444+
* pymbolic_conv(pde_coeff) * scale))
445+
446+
insns.append(lp.Assignment(
447+
assignee=coeffs_copy[curr_copy_idx, idx],
448+
expression=expr,
449+
id="update_coeffs",
450+
depends_on=frozenset(["copy_coeffs"]),
451+
depends_on_is_final=True,
452+
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
453+
))
454+
455+
if c != sync_split:
456+
# We now copy before synchronization
457+
v = [pymbolic.var(f"z{i}") for i in range(dim)]
458+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
459+
iorder = pymbolic.var("iorder3")
460+
idx = get_idx(v)
461+
domains += get_domains(v, iorder, with_sync=True)[2:]
462+
463+
insns.append(lp.Assignment(
464+
assignee=coeffs_copy[0, idx],
465+
expression=coeffs_copy[1, idx],
466+
id="copy_sync",
467+
depends_on=frozenset(["update_coeffs"]),
468+
depends_on_is_final=True,
469+
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
470+
))
471+
472+
v = [pymbolic.var(f"y{i}") for i in range(dim)]
473+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
474+
iorder = pymbolic.var("iorder2")
475+
idx = get_idx(v)
476+
domains += get_domains(v, iorder, with_sync=False)[1:]
477+
478+
if c == sync_split:
479+
# We did not have to sync within the c rows.
480+
# We last wrote to coeffs_copy[v[0]//c % 2, :] and we read from it.
481+
fetch_idx = (v[0]//c) % 2
482+
else:
483+
# We need to sync within the c rows.
484+
# We last wrote to coeffs_copy[0, :] and we read from it.
485+
fetch_idx = 0
486+
487+
for ikernel, expr_dict in enumerate(sym_expr_dicts):
488+
expr = sum(coeff * prod(powers[i,
489+
v[i] + max_deriv_order - mi[i]] for i in range(dim))
490+
* (1 / rscale ** sum(mi))
491+
for mi, coeff in expr_dict.items())
492+
493+
insn = lp.Assignment(
494+
assignee=result[ikernel],
495+
expression=(result[ikernel]
496+
+ coeffs_copy[fetch_idx, idx] * expr),
497+
id=f"write_{ikernel}",
498+
depends_on=frozenset(["update_monomials",
499+
"update_coeffs" if c == sync_split else "copy_sync"]),
500+
depends_on_is_final=True,
501+
)
502+
insns.append(insn)
503+
504+
tags = {
505+
"e2p_iorder1": "l.0",
506+
f"e2p_{x0}_outer": "unr",
507+
f"e2p_{x0}_inner": "unr",
508+
f"e2p_{v[0]}_inner": "unr",
509+
"e2p_iorder2": "unr",
510+
}
511+
if c != sync_split:
512+
tags["e2p_iorder3"] = "l.0"
513+
514+
optimizations += [
515+
lambda knl: lp.tag_inames(knl, tags),
516+
lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy",
517+
lp.AddressSpace.LOCAL),
518+
lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"),
519+
]
520+
521+
target_args = gather_loopy_arguments((expansion,) + tuple(kernels))
522+
loopy_knl = lp.make_function(domains, insns,
523+
kernel_data=[
524+
lp.GlobalArg("result", shape=(len(kernels),), is_input=True,
525+
is_output=True),
526+
lp.GlobalArg("coeffs",
527+
shape=(ncoeffs,), is_input=True, is_output=False),
528+
lp.GlobalArg("center, target",
529+
shape=(dim,), is_input=True, is_output=False),
530+
lp.ValueArg("rscale", is_input=True),
531+
lp.ValueArg("itgt", is_input=True),
532+
lp.ValueArg("ntargets", is_input=True),
533+
lp.GlobalArg("targets",
534+
shape=(dim, "ntargets"), is_input=True, is_output=False),
535+
*target_args,
536+
*temporary_variables,
537+
...],
538+
name="e2p",
539+
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
540+
fixed_parameters={
541+
"dim": dim,
542+
"nresults": len(kernels),
543+
"order": order,
544+
"max_deriv_order": max_deriv_order,
545+
"ncoeffs": ncoeffs,
546+
},
547+
)
548+
549+
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
550+
551+
for kernel in kernels:
552+
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
553+
554+
return loopy_knl, optimizations

sumpy/fmm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,7 @@ def eval_multipoles(self,
933933
result=pot,
934934

935935
rscale=self.level_to_rscale(isrc_level),
936+
max_ntargets_in_one_box=self.max_ntargets_in_one_box,
936937

937938
wait_for=wait_for,
938939

@@ -1069,6 +1070,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
10691070
result=pot,
10701071

10711072
rscale=self.level_to_rscale(lev),
1073+
max_ntargets_in_one_box=self.max_ntargets_in_one_box,
10721074

10731075
**kwargs)
10741076
events.append(evt)

sumpy/symbolic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def _find_symbolic_backend():
118118
functions = sym.functions
119119
Number = sym.Number
120120
Float = sym.Float
121+
diff = sym.diff
121122

122123

123124
def _coeff_isneg(a):

sumpy/toys.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def _e2p(psource, targets, e2p):
250250
target_boxes=boxes,
251251
box_target_starts=box_target_starts,
252252
box_target_counts_nonchild=box_target_counts_nonchild,
253+
max_ntargets_in_one_box=ntargets,
253254
centers=centers,
254255
rscale=psource.rscale,
255256
targets=targets,

test/test_kernels.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative
367367
target_boxes=source_boxes,
368368
box_target_starts=box_target_starts,
369369
box_target_counts_nonchild=box_target_counts_nonchild,
370+
max_ntargets_in_one_box=ntargets,
370371
centers=centers,
371372
targets=targets,
372373
rscale=rscale,
@@ -558,6 +559,7 @@ def eval_at(e2p, source_box_nr, rscale):
558559
target_boxes=e2p_target_boxes,
559560
box_target_starts=e2p_box_target_starts,
560561
box_target_counts_nonchild=e2p_box_target_counts_nonchild,
562+
max_ntargets_in_one_box=ntargets,
561563
centers=centers,
562564
targets=targets,
563565

0 commit comments

Comments
 (0)