
Commit 9b94180

yashk2810 authored and Google-ML-Automation committed
[sharding_in_types] Add slice_p and squeeze_p sharding rules to make flash attention work in the backward pass
For `slice_p`'s sharding rule, I error out if the operand dim is sharded and the output dim is not divisible by that axis size. I am working on a design to make JAX support uneven sharding at the top level, after which `slice_p`'s sharding rule can simply `return operand.sharding`. Another option is to add `out_sharding` to `slice`, but it won't be necessary once uneven sharding support lands.

PiperOrigin-RevId: 698522980
1 parent d219439 commit 9b94180
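
As a quick orientation (not part of the commit), the sketch below mirrors the tests added in tests/pjit_test.py: the mesh shape and axis names are illustrative, it assumes at least four devices, and it assumes the experimental sharding-in-types mode (`config.sharding_in_types`) is enabled.

import jax
import numpy as np
from jax import lax
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Illustrative 2x2 mesh; requires at least 4 devices.
mesh = Mesh(np.asarray(jax.devices()[:4]).reshape(2, 2), ('x', 'y'))
x = jax.device_put(np.arange(16).reshape(4, 4),
                   NamedSharding(mesh, P('x', None)))

@jax.jit
def f(x):
  # With this commit, slice_p (and likewise squeeze_p) propagates the
  # operand's sharding instead of failing the sharding-in-types checks.
  return lax.slice(x, (0, 0), (4, 3))

out = f(x)  # out.sharding is NamedSharding(mesh, P('x', None))

Slicing a dimension that is itself sharded still raises NotImplementedError when the sliced size is not divisible by the number of shards on that dimension, as described above.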

File tree

4 files changed: +84 −5 lines

jax/_src/lax/lax.py
jax/_src/lax/slicing.py
jax/_src/pallas/core.py
tests/pjit_test.py

jax/_src/lax/lax.py

Lines changed: 12 additions & 2 deletions
@@ -4527,6 +4527,12 @@ def _squeeze_dtype_rule(operand, *, dimensions):
 def _squeeze_shape_rule(operand, *, dimensions):
   return _compute_squeeze_shape(np.shape(operand), dimensions)

+def _squeeze_sharding_rule(operand, *, dimensions):
+  dims_set = set(dimensions)
+  new_spec = tuple(s for i, s in enumerate(operand.sharding.spec)
+                   if i not in dims_set)
+  return NamedSharding(operand.sharding.mesh, P(*new_spec))
+
 def _compute_squeeze_shape(shape, dimensions):
   dims_set = set(dimensions)
   if len(dims_set) != len(dimensions):

@@ -4555,15 +4561,19 @@ def _squeeze_batch_rule(batched_args, batch_dims, *, dimensions):
   return squeeze(operand, dimensions=dimensions), bdim_out

 squeeze_p = standard_primitive(_squeeze_shape_rule, _squeeze_dtype_rule,
-                               'squeeze')
+                               'squeeze', sharding_rule=_squeeze_sharding_rule)
 ad.deflinear2(squeeze_p, _squeeze_transpose_rule)
 batching.primitive_batchers[squeeze_p] = _squeeze_batch_rule
 pe.def_trivial_padding(squeeze_p)
 batching.ragged_prop_rules[squeeze_p] = batching.ragged_mask_no_op_rule

 def _squeeze_lower(ctx, operand, *, dimensions):
   del dimensions  # Implied by the output aval.
-  return [mlir.reshape(ctx, operand, ctx.avals_out[0])]
+  aval_out, = ctx.avals_out
+  out = mlir.reshape(ctx, operand, aval_out)
+  if config.sharding_in_types.value:
+    return [mlir.lower_sharding_under_shit(ctx, out, aval_out)]
+  return [out]

 mlir.register_lowering(squeeze_p, _squeeze_lower)
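
For reference, a minimal standalone sketch (not part of the commit) of what `_squeeze_sharding_rule` computes: the output spec is the operand spec with the entries for the squeezed dimensions dropped.

# Illustration only: mimic the spec computation on plain tuples, without
# building a real NamedSharding.
def squeezed_spec(spec, dimensions):
  dims_set = set(dimensions)
  return tuple(s for i, s in enumerate(spec) if i not in dims_set)

assert squeezed_spec(('x', None, None), (2,)) == ('x', None)  # as in test_squeeze below
assert squeezed_spec((None, 'y', None), (0, 2)) == ('y',)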

jax/_src/lax/slicing.py

Lines changed: 31 additions & 3 deletions
@@ -42,6 +42,7 @@
     _input_dtype,
     standard_primitive,
 )
+from jax._src.sharding_impls import NamedSharding, PartitionSpec as P
 from jax._src.lib.mlir import ir
 from jax._src.lib.mlir.dialects import hlo
 from jax._src.typing import Array, ArrayLike, Shape

@@ -1270,6 +1271,29 @@ def _slice_shape_rule(operand, *, start_indices, limit_indices, strides):
   return tuple(core.stride_dim(d, window_size=1, window_stride=s)
                for d, s in zip(diff, strides))

+def _get_sub_spec_size(mesh, sub_spec):
+  if isinstance(sub_spec, tuple):
+    return math.prod(mesh.shape[s] for s in sub_spec)
+  return mesh.shape[sub_spec]
+
+def _slice_sharding_rule(operand, *, start_indices, limit_indices, strides):
+  # TODO(yashkatariya): Once JAX supports uneven sharding at the top level,
+  # change this logic to `return operand.sharding` directly.
+  out_shape = _slice_shape_rule(operand, start_indices=start_indices,
+                                limit_indices=limit_indices, strides=strides)
+  mesh = operand.sharding.mesh
+  new_spec = []
+  for op_sh, out_sh, op_spec in safe_zip(
+      operand.shape, out_shape, operand.sharding.spec):
+    if (op_sh != out_sh and op_spec is not None and
+        out_sh % _get_sub_spec_size(mesh, op_spec) != 0):
+      raise NotImplementedError(
+          f"slicing on sharded dims where out dim ({out_sh}) is not divisible by"
+          f" mesh axes ({_get_sub_spec_size(mesh, op_spec)}) with spec"
+          f" ({op_spec}) is not implemented.")
+    new_spec.append(op_spec)
+  return NamedSharding(mesh, P(*new_spec))
+
 def _slice_transpose_rule(t, operand, *, start_indices, limit_indices, strides):
   assert ad.is_undefined_primal(operand)
   operand_shape = operand.aval.shape

@@ -1308,7 +1332,8 @@ def _slice_batching_rule(batched_args, batch_dims, *, start_indices,
   out = slice(operand, new_start_indices, new_limit_indices, new_strides)
   return out, bdim

-slice_p = standard_primitive(_slice_shape_rule, _input_dtype, 'slice')
+slice_p = standard_primitive(_slice_shape_rule, _input_dtype, 'slice',
+                             sharding_rule=_slice_sharding_rule)
 ad.deflinear2(slice_p, _slice_transpose_rule)
 batching.primitive_batchers[slice_p] = _slice_batching_rule
 # TODO(mvoz): A better slice rule for ragged prop, enforcing boundaries

@@ -1333,8 +1358,11 @@ def _slice_impl(x, start_indices, limit_indices, strides):
 def _slice_lower(ctx, x, *, start_indices, limit_indices, strides):
   strides = strides or [1] * len(start_indices)
   aval_out, = ctx.avals_out
-  return [mlir.slice_op(ctx, x, aval_out,
-                        start_indices=start_indices, limit_indices=limit_indices, strides=strides)]
+  out = mlir.slice_op(ctx, x, aval_out, start_indices=start_indices,
+                      limit_indices=limit_indices, strides=strides)
+  if config.sharding_in_types.value:
+    return [mlir.lower_sharding_under_shit(ctx, out, aval_out)]
+  return [out]

 mlir.register_lowering(slice_p, _slice_lower)
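
To make the divisibility check in `_slice_sharding_rule` concrete, here is a small standalone sketch (hypothetical helper names, not part of the commit) of the same logic on plain Python values, using the shapes from the tests below: slicing a (4, 4) array down to (4, 3) is fine while the second dimension is unsharded, but errors once that dimension is sharded over an axis of size 2, since 3 % 2 != 0.

import math

# Hypothetical standalone version of the check; mesh_axis_sizes plays the
# role of mesh.shape in the real rule.
def check_slice_sharding(op_shape, out_shape, spec, mesh_axis_sizes):
  for op_dim, out_dim, axes in zip(op_shape, out_shape, spec):
    if op_dim == out_dim or axes is None:
      continue  # unsliced or unsharded dims are always fine
    axes = axes if isinstance(axes, tuple) else (axes,)
    shard_count = math.prod(mesh_axis_sizes[a] for a in axes)
    if out_dim % shard_count != 0:
      raise NotImplementedError(
          f"slicing on sharded dims where out dim ({out_dim}) is not "
          f"divisible by mesh axes ({shard_count}) is not implemented.")

sizes = {'x': 2, 'y': 2}
check_slice_sharding((4, 4), (4, 3), ('x', None), sizes)   # OK: dim 1 is unsharded
# check_slice_sharding((4, 4), (4, 3), ('x', 'y'), sizes)  # raises: 3 % 2 != 0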

jax/_src/pallas/core.py

Lines changed: 4 additions & 0 deletions
@@ -219,6 +219,10 @@ def __init__(self, inner_aval: jax_core.ShapedArray, memory_space: Any):
   def __repr__(self) -> str:
     return f'MemRef<{self.memory_space}>{{{self.inner_aval.str_short()}}}'

+  @property
+  def sharding(self):
+    return self.inner_aval.sharding
+
   def update_weak_type(self, weak_type):
     return AbstractMemoryRef(
         self.inner_aval.update_weak_type(weak_type), self.memory_space)

tests/pjit_test.py

Lines changed: 37 additions & 0 deletions
@@ -5285,6 +5285,43 @@ def f(x, y):
     self.assertArraysEqual(out, (np_inp @ np_inp.T) * 2)
     self.assertEqual(out.sharding, NamedSharding(mesh, P('x', None)))

+  def test_slice(self):
+    mesh = jtu.create_mesh((2, 2), ('x', 'y'))
+    np_inp = np.arange(16).reshape(4, 4)
+    arr = jax.device_put(np_inp, NamedSharding(mesh, P('x', None)))
+
+    @jax.jit
+    def f(x):
+      y = lax.slice(x, (0, 0), (4, 3))
+      self.assertEqual(y.sharding.spec, P('x', None))
+      return y
+
+    out = f(arr)
+    self.assertEqual(out.sharding, NamedSharding(mesh, P('x', None)))
+    self.assertIn('@Sharding', f.lower(arr).as_text())
+
+    with self.assertRaisesRegex(NotImplementedError, "slicing on sharded dims"):
+      f(jax.device_put(np_inp, NamedSharding(mesh, P('x', 'y'))))
+
+    with self.assertRaisesRegex(NotImplementedError, "slicing on sharded dims"):
+      f(jax.device_put(np_inp, NamedSharding(mesh, P(None, ('x', 'y')))))
+
+  def test_squeeze(self):
+    mesh = jtu.create_mesh((2, 2), ('x', 'y'))
+    np_inp = np.arange(16).reshape(4, 4, 1)
+    arr = jax.device_put(np_inp, NamedSharding(mesh, P('x', None, None)))
+
+    @jax.jit
+    def f(x):
+      y = lax.squeeze(x, (2,))
+      self.assertEqual(y.sharding.spec, P('x', None))
+      return y
+
+    out = f(arr)
+    self.assertEqual(out.sharding, NamedSharding(mesh, P('x', None)))
+    self.assertIn('@Sharding', f.lower(arr).as_text())
+    self.assertArraysEqual(out, np.squeeze(np_inp, axis=2))
+

 @jtu.pytest_mark_if_available('multiaccelerator')
 class PJitErrorTest(jtu.JaxTestCase):
