Skip to content

Commit f39226d

Browse files
committed
Need sync every two rows for biharmonic
1 parent 9e14ffb commit f39226d

File tree

1 file changed

+93
-21
lines changed

1 file changed

+93
-21
lines changed

sumpy/expansion/loopy.py

Lines changed: 93 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import sumpy.symbolic as sym
2828
from sumpy.assignment_collection import SymbolicAssignmentCollection
2929
from sumpy.tools import gather_loopy_arguments
30-
from math import prod
30+
from math import prod, gcd
3131

3232
import logging
3333
logger = logging.getLogger(__name__)
@@ -239,6 +239,9 @@ def l2p_loopy_volume_taylor(expansion, kernels):
239239
expression=coeffs[icoeff],
240240
id="copy_coeffs",
241241
))
242+
# We need two rows for coeffs_copy since we cannot use inplace
243+
# updates due to parallel updates so we alternatively use
244+
# coeffs_copy[0, :] and coeffs_copy[1, :] to write and read from.
242245
temporary_variables.append(lp.TemporaryVariable(
243246
name="coeffs_copy",
244247
shape=(2, ncoeffs),
@@ -254,16 +257,34 @@ def l2p_loopy_volume_taylor(expansion, kernels):
254257
if all(m != 0 for m in max_mi):
255258
raise NotImplementedError("non-elliptic PDEs")
256259

257-
c = max_mi[axis_permutation[0]]
260+
slowest_axis = axis_permutation[0]
261+
c = max_mi[slowest_axis]
258262
v = [pymbolic.var(f"x{i}") for i in range(dim)]
259-
v[axis_permutation[0]], v[0] = v[0], v[axis_permutation[0]]
263+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
260264
x0 = v[0]
261265

262-
def get_domains(v, iorder):
266+
# sync_split is the maximum number of iterations in v[0] that we can do
267+
# before a synchronization is needed. For Laplace 2D there are two rows
268+
# of stored coeffs, and both of them can be calculated before a sync
269+
# is needed. For biharmonic 2D there are four rows in stored coeffs,
270+
# but synchronization needs to happen every two rows because calculating
271+
# the 6th row needs the 4th row synchronized
272+
sync_split = gcd(*[c - deriv_id.mi[slowest_axis]
273+
for deriv_id in deriv_id_to_coeff])
274+
275+
def get_domains(v, iorder, with_sync):
263276
domains = [f"{{ [{x0}_outer]: 0<={x0}_outer<={order//c} }}"]
264-
expr = f"{v[0]}_inner + {c}*{x0}_outer"
265-
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
266-
f"and 0<={v[0]}_inner<{c} }}"]
277+
if with_sync:
278+
expr = f"{c//sync_split}*{x0}_sync_outer + {c}*{x0}_outer"
279+
domains += [f"{{ [{x0}_sync_outer]: 0<={expr}<=order "
280+
f"and 0<={x0}_sync_outer<{c//sync_split} }}"]
281+
expr += f" + {v[0]}_inner"
282+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
283+
f"and 0<={v[0]}_inner<{sync_split} }}"]
284+
else:
285+
expr = f"{v[0]}_inner + {c}*{x0}_outer"
286+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
287+
f"and 0<={v[0]}_inner<{c} }}"]
267288
domains += [f"{{ [{v[0]}]: {expr}<={v[0]}<={expr} }}"]
268289
domains += [f"{{ [{iorder}]: {v[0]}<={iorder}<=order }}"]
269290
upper_bound = f"{iorder}-{v[0]}"
@@ -280,9 +301,37 @@ def get_idx(v):
280301
idx = wrangler.get_storage_index(idx_sym)
281302
return idx
282303

283-
domains += get_domains(v, iorder)
304+
domains += get_domains(v, iorder, with_sync=True)
284305
idx = get_idx(v)
285306

307+
if c == sync_split:
308+
# We do not need to sync within the c rows.
309+
# Update the values from the c rows set coeffs_copy[p, :] from
310+
# the previous c rows set coeffs_copy[p-1, :]
311+
# and then read from coeffs_copy[p, :].
312+
# This code path is different to avoid an extra copy and
313+
# a synchronization step.
314+
prev_copy_idx = (v[0]//c - 1) % 2
315+
curr_copy_idx = (v[0]//c) % 2
316+
fetch_idx = (v[0]//c) % 2
317+
else:
318+
# We need to sync within the c rows.
319+
# Using the biharmonic 2D example:
320+
# - Update the rows 4, 5 at coeffs_copy[1, :] from values at
321+
# coeffs_copy[0, :]
322+
# - Synchronize
323+
# - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
324+
# - Synchronize
325+
# - Update the rows 6, 7 at coeffs_copy[1, :] from values at
326+
# coeffs_copy[0, :]
327+
# - Synchronize
328+
# - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
329+
# - Synchronize
330+
# - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
331+
prev_copy_idx = 0
332+
curr_copy_idx = 1
333+
fetch_idx = 0
334+
286335
max_mi_sym = [v[i] - max_mi[i] for i in range(dim)]
287336
scale = -1/deriv_id_to_coeff[max_deriv_id]
288337
expr = 0
@@ -291,25 +340,42 @@ def get_idx(v):
291340
continue
292341
mi_sym = [max_mi_sym[i] + deriv_id.mi[i] for i in range(dim)]
293342
mi_sym[0] = mi_sym[0] % c
294-
expr += (coeffs_copy[(v[0]//c + 1) % 2,
343+
expr += (coeffs_copy[prev_copy_idx,
295344
wrangler.get_storage_index(mi_sym)]
296345
* (rscale**(sum(max_mi) - sum(deriv_id.mi))
297346
* pymbolic_conv(pde_coeff) * scale))
298347

299348
insns.append(lp.Assignment(
300-
assignee=coeffs_copy[(v[0]//c) % 2, idx],
349+
assignee=coeffs_copy[curr_copy_idx, idx],
301350
expression=expr,
302351
id="update_coeffs",
303352
depends_on=frozenset(["copy_coeffs"]),
304353
depends_on_is_final=True,
305354
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
306355
))
307356

357+
if c != sync_split:
358+
# We now copy before synchronization
359+
v = [pymbolic.var(f"z{i}") for i in range(dim)]
360+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
361+
iorder = pymbolic.var("iorder3")
362+
idx = get_idx(v)
363+
domains += get_domains(v, iorder, with_sync=True)[2:]
364+
365+
insns.append(lp.Assignment(
366+
assignee=coeffs_copy[0, idx],
367+
expression=coeffs_copy[1, idx],
368+
id="copy_sync",
369+
depends_on=frozenset(["update_coeffs"]),
370+
depends_on_is_final=True,
371+
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
372+
))
373+
308374
v = [pymbolic.var(f"y{i}") for i in range(dim)]
309-
v[axis_permutation[0]], v[0] = v[0], v[axis_permutation[0]]
375+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
310376
iorder = pymbolic.var("iorder2")
311377
idx = get_idx(v)
312-
domains += get_domains(v, iorder)[1:]
378+
domains += get_domains(v, iorder, with_sync=False)[1:]
313379

314380
for ikernel, expr_dict in enumerate(sym_expr_dicts):
315381
expr = sum(coeff * prod(powers[i,
@@ -320,20 +386,26 @@ def get_idx(v):
320386
insn = lp.Assignment(
321387
assignee=result[ikernel],
322388
expression=(result[ikernel]
323-
+ coeffs_copy[(v[0]//c) % 2, idx] * expr),
389+
+ coeffs_copy[fetch_idx, idx] * expr),
324390
id=f"write_{ikernel}",
325-
depends_on=frozenset(["update_monomials", "update_coeffs"]),
391+
depends_on=frozenset(["update_monomials",
392+
"update_coeffs" if c == sync_split else "copy_sync"]),
326393
depends_on_is_final=True,
327394
)
328395
insns.append(insn)
396+
397+
tags = {
398+
"e2p_iorder1": "l.0",
399+
f"e2p_{x0}_outer": "unr",
400+
f"e2p_{x0}_inner": "unr",
401+
f"e2p_{v[0]}_inner": "unr",
402+
"e2p_iorder2": "unr",
403+
}
404+
if c != sync_split:
405+
tags["e2p_iorder3"] = "l.0"
406+
329407
optimizations += [
330-
lambda knl: lp.tag_inames(knl, {
331-
"e2p_iorder1": "l.0",
332-
f"e2p_{x0}_outer": "unr",
333-
f"e2p_{x0}_inner": "unr",
334-
f"e2p_{v[0]}_inner": "unr",
335-
"e2p_iorder2": "unr",
336-
}),
408+
lambda knl: lp.tag_inames(knl, tags),
337409
lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy",
338410
lp.AddressSpace.LOCAL),
339411
lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"),

0 commit comments

Comments
 (0)