
Commit 59e13f8

yashk2810 authored and Google-ML-Automation committed
Add sharding argument to reshape since it also takes a shape argument for the output shape
PiperOrigin-RevId: 700163883
1 parent c5dc980 commit 59e13f8

6 files changed: 62 additions and 32 deletions

jax/_src/lax/lax.py

Lines changed: 35 additions & 14 deletions
@@ -1231,7 +1231,8 @@ def broadcast_to_rank(x: ArrayLike, rank: int) -> Array:
   return broadcast(x, (1,) * (rank - ndim))

 def reshape(operand: ArrayLike, new_sizes: Shape,
-            dimensions: Sequence[int] | None = None) -> Array:
+            dimensions: Sequence[int] | None = None,
+            sharding: NamedSharding | None = None) -> Array:
   """Wraps XLA's `Reshape
   <https://www.tensorflow.org/xla/operation_semantics#reshape>`_
   operator.
@@ -1285,7 +1286,8 @@ def reshape(operand: ArrayLike, new_sizes: Shape,

   return reshape_p.bind(
       operand, *dyn_shape, new_sizes=tuple(static_new_sizes),
-      dimensions=None if dims is None or same_dims else dims)
+      dimensions=None if dims is None or same_dims else dims,
+      sharding=sharding)

 def pad(operand: ArrayLike, padding_value: ArrayLike,
         padding_config: Sequence[tuple[int, int, int]]) -> Array:
@@ -4654,7 +4656,7 @@ def shape_as_value(shape: core.Shape):
   ]
   return concatenate(dims, dimension=0)

-def _reshape_shape_rule(operand, *, new_sizes, dimensions):
+def _reshape_shape_rule(operand, *, new_sizes, dimensions, sharding):
   if not all(d >= 0 for d in new_sizes):
     msg = 'reshape new_sizes must all be positive, got {}.'
     raise TypeError(msg.format(new_sizes))
@@ -4674,7 +4676,9 @@ def _reshape_shape_rule(operand, *, new_sizes, dimensions):
     raise TypeError(msg.format(dimensions, np.shape(operand)))
   return tuple(new_sizes)

-def _reshape_sharding_rule(operand, *, new_sizes, dimensions):
+def _reshape_sharding_rule(operand, *, new_sizes, dimensions, sharding):
+  if sharding is not None:
+    return sharding
   filtered_spec = [
       (sh, sp) for sh, sp in zip(operand.shape, operand.sharding.spec)
       if sh != 1
@@ -4687,14 +4691,18 @@ def _reshape_sharding_rule(operand, *, new_sizes, dimensions):
     else:
       sh, sp = next(fs)
       if n != sh:
-        raise NotImplementedError
+        raise ValueError(
+            'This reshape is not supported. Please specify the sharding of the'
+            ' output via the `sharding` argument of reshape.')
       new_spec.append(sp)
   return operand.sharding.with_spec(new_spec)

-def _reshape_typecheck_rule(_, operand, *dyn_shape, new_sizes, dimensions):
+def _reshape_typecheck_rule(_, operand, *dyn_shape, new_sizes, dimensions,
+                            sharding):
   if not dyn_shape:
     out_aval, effects = reshape_p.abstract_eval(
-        operand.aval, new_sizes=new_sizes, dimensions=dimensions)
+        operand.aval, new_sizes=new_sizes, dimensions=dimensions,
+        sharding=sharding)
     return [out_aval], effects
   else:
     # TODO(mattjj, necula): perform more checks like _reshape_shape_rule
@@ -4705,18 +4713,29 @@ def _reshape_typecheck_rule(_, operand, *dyn_shape, new_sizes, dimensions):
     return [out_aval], core.no_effects


-def _reshape_dtype_rule(operand, *, new_sizes, dimensions):
+def _reshape_dtype_rule(operand, *, new_sizes, dimensions, sharding):
   return operand.dtype

-def _reshape_transpose_rule(t, operand, *, new_sizes, dimensions):
+def _reshape_transpose_rule(t, operand, *, new_sizes, dimensions, sharding):
   assert ad.is_undefined_primal(operand)
   if dimensions is None:
+    if config.sharding_in_types.value:
+      return [reshape(t, operand.aval.shape, sharding=operand.aval.sharding)]
     return [reshape(t, operand.aval.shape)]
   else:
-    return [transpose(reshape(t, np.take(operand.aval.shape, dimensions)),
+    if config.sharding_in_types.value:
+      t_s = operand.sharding.with_spec(
+          tuple(map(str, np.take(operand.aval.sharding.spec, dimensions))))
+    else:
+      t_s = None
+    return [transpose(reshape(t, np.take(operand.aval.shape, dimensions),
+                              sharding=t_s),
                       np.argsort(dimensions))]

-def _reshape_batch_rule(batched_args, batch_dims, *, new_sizes, dimensions):
+def _reshape_batch_rule(batched_args, batch_dims, *, new_sizes, dimensions,
+                        sharding):
+  if sharding is not None:
+    raise NotImplementedError
   operand, = batched_args
   bdim, = batch_dims
   operand = batching.moveaxis(operand, bdim, 0)
@@ -4725,20 +4744,22 @@ def _reshape_batch_rule(batched_args, batch_dims, *, new_sizes, dimensions):
   return reshape(operand, operand.shape[:1] + new_sizes, dimensions), 0


-def _reshape_lower(ctx, x, *dyn_shape, new_sizes, dimensions):
+def _reshape_lower(ctx, x, *dyn_shape, new_sizes, dimensions, sharding):
   aval_out, = ctx.avals_out
   if dimensions is not None:
     x = hlo.transpose(x, mlir.dense_int_array(dimensions))
   if dyn_shape:
     aval_out = aval_out.update(shape=_merge_dyn_shape(new_sizes, dyn_shape))
   out = mlir.reshape(ctx, x, aval_out)
   if config.sharding_in_types.value:
+    if sharding is not None:
+      assert sharding == aval_out.sharding
     return [mlir.lower_sharding_under_shit(ctx, out, aval_out)]
   return [out]

 def _reshape_staging_rule(
-    trace, x, *dyn, new_sizes, dimensions):
-  params = dict(new_sizes=new_sizes, dimensions=dimensions)
+    trace, x, *dyn, new_sizes, dimensions, sharding):
+  params = dict(new_sizes=new_sizes, dimensions=dimensions, sharding=sharding)
   if not dyn:
     return trace.default_process_primitive(reshape_p, (x,), params)
   av = core.DShapedArray(_merge_dyn_shape(new_sizes, dyn), x.dtype, x.weak_type)
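
For context, a minimal usage sketch of the new argument, adapted from the test added in tests/pjit_test.py below. The mesh, axis name 'x', and shapes are illustrative; it assumes at least two devices, and the config flag name is an assumption inferred from the `config.sharding_in_types` checks in this diff.

from functools import partial

import jax
import numpy as np
from jax import lax
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Assumption: the experimental sharding-in-types mode must be enabled; the
# flag name below is inferred from `config.sharding_in_types` in the diff.
jax.config.update('jax_sharding_in_types', True)

mesh = Mesh(np.array(jax.devices()[:2]), ('x',))  # assumes >= 2 devices
x = jax.device_put(np.arange(8, dtype=np.float32).reshape(8, 1),
                   NamedSharding(mesh, P('x', None)))

# (8, 1) -> (1, 4, 2) splits the sharded axis, so the output sharding cannot
# be inferred from the operand; without `sharding=` the new rule raises
# ValueError pointing at this argument.
out_s = NamedSharding(mesh.abstract_mesh, P(None, None, 'x'))

@partial(jax.jit, static_argnums=1)
def f(x, s):
  return lax.reshape(x, (1, 4, 2), sharding=s)

y = f(x, out_s)  # output sharded as P(None, None, 'x')

Passing sharding=None keeps the previous behavior: the rule infers the output sharding where it can, and the uninferable case now raises ValueError (rather than NotImplementedError) suggesting the new argument.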

jax/_src/pallas/mosaic/lowering.py

Lines changed: 2 additions & 1 deletion
@@ -1849,7 +1849,8 @@ def _convert_element_type_lowering_rule(
 lowering_rules[lax.convert_element_type_p] = _convert_element_type_lowering_rule


-def _reshape_lowering_rule(ctx: LoweringRuleContext, x, new_sizes, dimensions):
+def _reshape_lowering_rule(ctx: LoweringRuleContext, x, new_sizes, dimensions,
+                           sharding):
   if dimensions is not None:
     raise NotImplementedError
   if any(d is None for d in new_sizes):

jax/_src/pallas/triton/lowering.py

Lines changed: 1 addition & 1 deletion
@@ -1612,7 +1612,7 @@ def _squeeze_lowering_rule(ctx: LoweringRuleContext, a, *, dimensions):

 @register_lowering(lax.reshape_p)
 def _reshape_lowering_rule(
-    ctx: LoweringRuleContext, a, *, new_sizes, dimensions
+    ctx: LoweringRuleContext, a, *, new_sizes, dimensions, sharding,
 ):
   del new_sizes  # Unused.
   if dimensions is not None:

jax/experimental/jax2tf/jax2tf.py

Lines changed: 1 addition & 1 deletion
@@ -2291,7 +2291,7 @@ def _empty(*, dtype):
 tf_impl[lax_internal.empty_p] = _empty


-def _reshape(operand, *, new_sizes, dimensions, _in_avals, _out_aval):
+def _reshape(operand, *, new_sizes, dimensions, sharding, _in_avals, _out_aval):
   if dimensions is None:
     dimensions = tf.range(tf.rank(operand))
   new_sizes_tf = _eval_shape(new_sizes, _in_avals[0].dtype)

jax/experimental/sparse/bcoo.py

Lines changed: 3 additions & 1 deletion
@@ -1826,7 +1826,9 @@ def bcoo_concatenate(operands: Sequence[BCOO], *, dimension: int) -> BCOO:
   return BCOO((new_data, new_indices), shape=out_aval.shape)


-def bcoo_reshape(mat: BCOO, *, new_sizes: Sequence[int], dimensions: Sequence[int] | None = None) -> BCOO:
+def bcoo_reshape(mat: BCOO, *, new_sizes: Sequence[int],
+                 dimensions: Sequence[int] | None = None,
+                 sharding=None) -> BCOO:
   """Sparse implementation of {func}`jax.lax.reshape`.

   Args:
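
The Pallas, Triton, jax2tf, and BCOO changes above all follow one pattern: parameters bound on reshape_p are forwarded as keywords to every registered rule or impl, so each backend rule must now accept the new `sharding` parameter even if it ignores it. A hypothetical rule sketch, not from this commit, just to illustrate the signature contract:

import numpy as np

def _my_reshape_rule(operand, *, new_sizes, dimensions, sharding):
  # Hypothetical backend rule: `sharding` must appear in the keyword signature
  # because it is now a parameter of reshape_p.
  del sharding  # Accepted for compatibility; unused by this backend.
  if dimensions is not None:
    # `dimensions` is a permutation applied before reshaping.
    operand = np.transpose(operand, dimensions)
  return np.reshape(operand, new_sizes)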

tests/pjit_test.py

Lines changed: 20 additions & 14 deletions
@@ -5147,31 +5147,37 @@ def h2(x, y):
   @parameterized.named_parameters(
       ('1', (16, 1), (1, 16, 1), P('x', None), P(None, 'x', None), False),
       ('2', (8, 2, 1), (1, 16, 1), P('x', None, None), P(None, 'x', None), True),
-      ('3', (8, 1), (1, 4, 2), P('x', None), P(None, 'x', None), True)
+      ('3', (8, 1), (1, 4, 2), P('x', None), P(None, None, 'x'), True)
   )
-  def test_reshape(self, src_shape, dst_shape, src_spec, dst_spec, will_error):
+  def test_reshape(self, src_shape, dst_shape, src_spec, dst_spec,
+                   use_sharding_arg):
     mesh = jtu.create_mesh((2,), ('x',))
     np_inp = np.arange(math.prod(src_shape),
                        dtype=np.float32).reshape(src_shape)
     arr = jax.device_put(np_inp, NamedSharding(mesh, src_spec))

-    @jax.jit
-    def f(x):
-      y = jnp.reshape(x, dst_shape)
+    @partial(jax.jit, static_argnums=1)
+    def f(x, new_sharding):
+      y = lax.reshape(x, dst_shape, sharding=new_sharding)
       y = y * 2
       self.assertEqual(y.sharding.spec, dst_spec)
       return y

-    if will_error:
-      with self.assertRaises(NotImplementedError):
-        f(arr)
-    else:
-      out = f(arr)
-      self.assertEqual(out.sharding, NamedSharding(mesh, dst_spec))
-      self.assertArraysEqual(out, np_inp.reshape(dst_shape) * 2)
+    new_s = (NamedSharding(mesh.abstract_mesh, dst_spec)
+             if use_sharding_arg else None)
+    out = f(arr, new_s)
+    self.assertEqual(out.sharding, NamedSharding(mesh, dst_spec))
+    self.assertArraysEqual(out, np_inp.reshape(dst_shape) * 2)
+
+    lowered_text = f.lower(arr, new_s).as_text()
+    self.assertIn('@Sharding', lowered_text)

-    lowered_text = f.lower(arr).as_text()
-    self.assertIn('@Sharding', lowered_text)
+    def g(x):
+      out = f(x, new_s)
+      return jnp.square(jnp.sum(out))
+
+    out = jax.jit(jax.grad(g))(arr)
+    self.assertEqual(out.sharding, arr.sharding)

   def test_select(self):
     mesh = jtu.create_mesh((2, 2), ('x', 'y'))
