Need sync every two rows for biharmonic

isuruf · isuruf · commit f39226dff0a7 · 2023-02-17T03:37:01.000-06:00
diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py
@@ -27,7 +27,7 @@
 import sumpy.symbolic as sym
 from sumpy.assignment_collection import SymbolicAssignmentCollection
 from sumpy.tools import gather_loopy_arguments
-from math import prod
+from math import prod, gcd
 
 import logging
 logger = logging.getLogger(__name__)
@@ -239,6 +239,9 @@ def l2p_loopy_volume_taylor(expansion, kernels):
             expression=coeffs[icoeff],
             id="copy_coeffs",
         ))
+        # coeffs_copy needs two rows: the updates happen in parallel, so
+        # they cannot be done in place. Instead we alternate between
+        # coeffs_copy[0, :] and coeffs_copy[1, :] to write to and read from.
         temporary_variables.append(lp.TemporaryVariable(
             name="coeffs_copy",
             shape=(2, ncoeffs),
@@ -254,16 +257,34 @@ def l2p_loopy_volume_taylor(expansion, kernels):
         if all(m != 0 for m in max_mi):
             raise NotImplementedError("non-elliptic PDEs")
 
-        c = max_mi[axis_permutation[0]]
+        slowest_axis = axis_permutation[0]
+        c = max_mi[slowest_axis]
         v = [pymbolic.var(f"x{i}") for i in range(dim)]
-        v[axis_permutation[0]], v[0] = v[0], v[axis_permutation[0]]
+        v[slowest_axis], v[0] = v[0], v[slowest_axis]
         x0 = v[0]
 
-        def get_domains(v, iorder):
+        # sync_split is the maximum number of iterations in v[0] that we can do
+        # before a synchronization is needed. For Laplace 2D there are two rows
+        # of stored coeffs, and both of them can be calculated before a sync
+        # is needed. For biharmonic 2D there are four rows in stored coeffs,
+        # but synchronization needs to happen every two rows because calculating
+        # the 6th row needs the 4th row synchronized
+        sync_split = gcd(*[c - deriv_id.mi[slowest_axis]
+                         for deriv_id in deriv_id_to_coeff])
+
+        def get_domains(v, iorder, with_sync):
             domains = [f"{{ [{x0}_outer]: 0<={x0}_outer<={order//c} }}"]
-            expr = f"{v[0]}_inner + {c}*{x0}_outer"
-            domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
-                f"and 0<={v[0]}_inner<{c} }}"]
+            if with_sync:
+                expr = f"{c//sync_split}*{x0}_sync_outer + {c}*{x0}_outer"
+                domains += [f"{{ [{x0}_sync_outer]: 0<={expr}<=order "
+                    f"and 0<={x0}_sync_outer<{c//sync_split} }}"]
+                expr += f" + {v[0]}_inner"
+                domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
+                    f"and 0<={v[0]}_inner<{sync_split} }}"]
+            else:
+                expr = f"{v[0]}_inner + {c}*{x0}_outer"
+                domains += [f"{{ [{v[0]}_inner]: 0<={expr}<=order "
+                    f"and 0<={v[0]}_inner<{c} }}"]
             domains += [f"{{ [{v[0]}]: {expr}<={v[0]}<={expr} }}"]
             domains += [f"{{ [{iorder}]: {v[0]}<={iorder}<=order }}"]
             upper_bound = f"{iorder}-{v[0]}"
@@ -280,9 +301,37 @@ def get_idx(v):
             idx = wrangler.get_storage_index(idx_sym)
             return idx
 
-        domains += get_domains(v, iorder)
+        domains += get_domains(v, iorder, with_sync=True)
         idx = get_idx(v)
 
+        if c == sync_split:
+            # We do not need to sync within a set of c rows.
+            # Update the current set of c rows in coeffs_copy[p, :] from
+            # the previous set of c rows in coeffs_copy[p-1, :],
+            # and then read from coeffs_copy[p, :].
+            # This code path is kept separate to avoid an extra copy and
+            # a synchronization step.
+            prev_copy_idx = (v[0]//c - 1) % 2
+            curr_copy_idx = (v[0]//c) % 2
+            fetch_idx = (v[0]//c) % 2
+        else:
+            # We need to sync within the c rows.
+            # Using the biharmonic 2D example:
+            # - Update the rows 4, 5 at coeffs_copy[1, :] from values at
+            #     coeffs_copy[0, :]
+            # - Synchronize
+            # - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
+            # - Synchronize
+            # - Update the rows 6, 7 at coeffs_copy[1, :] from values at
+            #     coeffs_copy[0, :]
+            # - Synchronize
+            # - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
+            # - Synchronize
+            # - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
+            prev_copy_idx = 0
+            curr_copy_idx = 1
+            fetch_idx = 0
+
         max_mi_sym = [v[i] - max_mi[i] for i in range(dim)]
         scale = -1/deriv_id_to_coeff[max_deriv_id]
         expr = 0
@@ -291,25 +340,42 @@ def get_idx(v):
                 continue
             mi_sym = [max_mi_sym[i] + deriv_id.mi[i] for i in range(dim)]
             mi_sym[0] = mi_sym[0] % c
-            expr += (coeffs_copy[(v[0]//c + 1) % 2,
+            expr += (coeffs_copy[prev_copy_idx,
                 wrangler.get_storage_index(mi_sym)]
                      * (rscale**(sum(max_mi) - sum(deriv_id.mi))
                      * pymbolic_conv(pde_coeff) * scale))
 
         insns.append(lp.Assignment(
-            assignee=coeffs_copy[(v[0]//c) % 2, idx],
+            assignee=coeffs_copy[curr_copy_idx, idx],
             expression=expr,
             id="update_coeffs",
             depends_on=frozenset(["copy_coeffs"]),
             depends_on_is_final=True,
             predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
         ))
 
+        if c != sync_split:
+            # Copy coeffs_copy[1, :] back to coeffs_copy[0, :] before the next sync
+            v = [pymbolic.var(f"z{i}") for i in range(dim)]
+            v[slowest_axis], v[0] = v[0], v[slowest_axis]
+            iorder = pymbolic.var("iorder3")
+            idx = get_idx(v)
+            domains += get_domains(v, iorder, with_sync=True)[2:]
+
+            insns.append(lp.Assignment(
+                assignee=coeffs_copy[0, idx],
+                expression=coeffs_copy[1, idx],
+                id="copy_sync",
+                depends_on=frozenset(["update_coeffs"]),
+                depends_on_is_final=True,
+                predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
+            ))
+
         v = [pymbolic.var(f"y{i}") for i in range(dim)]
-        v[axis_permutation[0]], v[0] = v[0], v[axis_permutation[0]]
+        v[slowest_axis], v[0] = v[0], v[slowest_axis]
         iorder = pymbolic.var("iorder2")
         idx = get_idx(v)
-        domains += get_domains(v, iorder)[1:]
+        domains += get_domains(v, iorder, with_sync=False)[1:]
 
         for ikernel, expr_dict in enumerate(sym_expr_dicts):
             expr = sum(coeff * prod(powers[i,
@@ -320,20 +386,26 @@ def get_idx(v):
             insn = lp.Assignment(
                 assignee=result[ikernel],
                 expression=(result[ikernel]
-                    + coeffs_copy[(v[0]//c) % 2, idx] * expr),
+                    + coeffs_copy[fetch_idx, idx] * expr),
                 id=f"write_{ikernel}",
-                depends_on=frozenset(["update_monomials", "update_coeffs"]),
+                depends_on=frozenset(["update_monomials",
+                    "update_coeffs" if c == sync_split else "copy_sync"]),
                 depends_on_is_final=True,
             )
             insns.append(insn)
+
+        tags = {
+            "e2p_iorder1": "l.0",
+            f"e2p_{x0}_outer": "unr",
+            f"e2p_{x0}_inner": "unr",
+            f"e2p_{v[0]}_inner": "unr",
+            "e2p_iorder2": "unr",
+        }
+        if c != sync_split:
+            tags["e2p_iorder3"] = "l.0"
+
         optimizations += [
-            lambda knl: lp.tag_inames(knl, {
-                "e2p_iorder1": "l.0",
-                f"e2p_{x0}_outer": "unr",
-                f"e2p_{x0}_inner": "unr",
-                f"e2p_{v[0]}_inner": "unr",
-                "e2p_iorder2": "unr",
-            }),
+            lambda knl: lp.tag_inames(knl, tags),
             lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy",
                 lp.AddressSpace.LOCAL),
             lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"),