@@ -58,257 +58,6 @@ def wait():
5858 threading .Thread (target = wait ).start ()
5959
6060
61- def _identity (x ):
62- return x
63-
64-
# Suffixes appended to a source mesh axis name when building the intermediate
# mesh for pre-resharding: the chosen axis is split into a
# '<axis>_intermediate_split' axis (stays sharded) and a
# '<axis>_intermediate_replica' axis (replicated). The replica axis is later
# located by matching on INTERMEDIATE_REPLICA_SUFFIX with `endswith`, so these
# string values must stay in sync with that lookup.
INTERMEDIATE_SPLIT_SUFFIX = '_intermediate_split'
INTERMEDIATE_REPLICA_SUFFIX = '_intermediate_replica'
67-
68-
69- def _maybe_find_intermediate_sharding (source_sharding , target_sharding ):
70- """Maybe finds an intermediate sharding to reshard to before target sharding.
71-
72- This function tries to find an intermediate sharding that can be used to
73- reshard the source sharding to the target sharding. This is useful when
74- resharding from a source sharding to a target sharding that requires an
75- all-gather, which can be expensive.
76-
77- For example, consider resharding an array from src_sharding (e.g., [fsdp: 8,
78- tp: 1]) to target_sharding (e.g., [fsdp: 1, tp: 4]). In this case, the source
79- has a larger sharding factor (8) than the target largest sharding factor (4)
80- on the tp dimension.
81- To avoid an expensive all-gather, we can introduce an intermediate sharding
82- (e.g., [fsdp_split: 4, fsdp_replica: 2, tp: 1]). This intermediate sharding
83- allows us to reshard the source array by still sharding along the fsdp
84- dimension and replicating it on the remaining devices. Then we can just
85- reshard any replica of the source to the target as normal.
86-
87- Args:
88- source_sharding: The source sharding.
89- target_sharding: The target sharding.
90-
91- Returns:
92- An intermediate sharding, or None if no intermediate sharding can be found.
93- """
94- if not isinstance (
95- source_sharding , jax .sharding .NamedSharding
96- ) or not isinstance (target_sharding , jax .sharding .NamedSharding ):
97- logging .vlog (
98- 2 ,
99- 'None-NamedSharding does not need intermediate sharding.'
100- f' { source_sharding = } , { target_sharding = } ' ,
101- )
102- return None
103- src_mesh = source_sharding .mesh
104- dst_mesh = target_sharding .mesh
105-
106- def _get_sharding_dims (sharding , mesh ):
107- sharding_dims = {}
108- for i , axis_name in enumerate (sharding .spec ):
109- if axis_name is None :
110- sharding_dims [(i , None )] = 1
111- elif isinstance (axis_name , str ):
112- sharding_dims [(i , mesh .axis_names .index (axis_name ))] = mesh .shape [
113- axis_name
114- ]
115- elif isinstance (axis_name , (tuple , list )):
116- for _ , axis in enumerate (axis_name ):
117- if axis is None :
118- sharding_dims [(i , None )] = 1
119- # Only handles two-level logical axis rules for now.
120- elif isinstance (axis , str ):
121- sharding_dims [(i , mesh .axis_names .index (axis ))] = (
122- mesh .shape [axis ]
123- )
124- else :
125- raise ValueError (f'Unsupported axis name: { axis_name } ' )
126- else :
127- raise ValueError (
128- f'Unsupported axis name: { axis_name } with type { type (axis_name )} '
129- )
130-
131- largest_shards = max (sharding_dims .values ()) if len (sharding_dims ) else 1
132- if len (sharding_dims ) < len (mesh .shape ):
133- for mi , mesh_axis in enumerate (mesh .axis_names ):
134- matched = any (mesh_axis == keys [1 ] for keys in sharding_dims )
135- if not matched :
136- sharding_dims [(None , mi )] = 1
137- return sharding_dims , largest_shards
138-
139- src_sharding_dims , src_largest_shards = _get_sharding_dims (
140- source_sharding , src_mesh
141- )
142- dst_sharding_dims , dst_largest_shards = _get_sharding_dims (
143- target_sharding , dst_mesh
144- )
145- # Not able to handle resharding with undividable shardings.
146- if src_largest_shards % dst_largest_shards != 0 :
147- logging .debug (
148- 'Resharding with undividable shardings is not optimized with'
149- ' experimental pre-reshard.'
150- ' source_sharding=%s, target_sharding=%s' ,
151- source_sharding ,
152- target_sharding ,
153- )
154- return None
155-
156- total_source_sharding_dims = math .prod (list (src_sharding_dims .values ()))
157- total_dst_sharding_dims = math .prod (list (dst_sharding_dims .values ()))
158- if (
159- total_source_sharding_dims <= total_dst_sharding_dims
160- or total_source_sharding_dims % total_dst_sharding_dims != 0
161- ):
162- return None
163-
164- new_split_dim_shards = None
165- new_split_axis = None
166- replicas = src_largest_shards // dst_largest_shards
167-
168- # Find gcd(src_dim_shards, dst_dim_shards),
169- # If all of them are 1s, an all-gather is needed as the single replica of
170- # the source cannot be presented by any sharded form on the target devices.
171- gcd_shards = []
172- for (sharding_mesh_axis_idx , src_dim_shards ), (_ , dst_dim_shards ) in zip (
173- src_sharding_dims .items (), dst_sharding_dims .items ()
174- ):
175- gcd_dim_shards = math .gcd (src_dim_shards , dst_dim_shards )
176- if gcd_dim_shards == 1 :
177- if (
178- src_dim_shards > dst_dim_shards
179- and src_dim_shards == src_largest_shards
180- ):
181- new_split_axis = sharding_mesh_axis_idx
182- new_split_dim_shards = (src_dim_shards // replicas , replicas )
183- gcd_shards .append (gcd_dim_shards )
184-
185- if math .prod (gcd_shards ) != 1 or new_split_axis is None :
186- return None
187-
188- # Generate the intermediate sharding.
189- new_split_mesh_axis_name = (
190- src_mesh .axis_names [new_split_axis [1 ]] + INTERMEDIATE_SPLIT_SUFFIX
191- )
192- new_split_mesh_replica_axis_name = (
193- src_mesh .axis_names [new_split_axis [1 ]] + INTERMEDIATE_REPLICA_SUFFIX
194- )
195- intermediate_mesh = jax .sharding .Mesh (
196- src_mesh .devices .reshape (
197- tuple (
198- list (src_mesh .devices .shape [: new_split_axis [1 ]])
199- + [new_split_dim_shards [0 ], new_split_dim_shards [1 ]]
200- + list (src_mesh .devices .shape [new_split_axis [1 ] + 1 :])
201- )
202- ),
203- axis_names = tuple (
204- list (src_mesh .axis_names [: new_split_axis [1 ]])
205- + [new_split_mesh_axis_name , new_split_mesh_replica_axis_name ]
206- + list (src_mesh .axis_names [new_split_axis [1 ] + 1 :])
207- ),
208- )
209-
210- intermediate_spec = tuple (
211- list (source_sharding .spec [: new_split_axis [0 ]])
212- + [new_split_mesh_axis_name ]
213- + list (source_sharding .spec [new_split_axis [0 ] + 1 :])
214- )
215- intermediate_sharding = jax .sharding .NamedSharding (
216- intermediate_mesh ,
217- jax .sharding .PartitionSpec (* intermediate_spec ),
218- memory_kind = source_sharding .memory_kind ,
219- )
220- return intermediate_sharding
221-
222-
def _experimental_pre_reshard(splitfn, src_pytree, target_shardings):
  """Simple heuristic to determine if resharding with replicated all-gather is needed.

  A replicated all-gather often results in heavy HBM occupation which we need
  to avoid. This function currently only handles the case like resharding from
  [fsdp: 8, tp: 1] to [fsdp: 1, tp: 4].
  We will improve the coverage on more complex cases along the development.

  Args:
    splitfn: The split function. Called as ``splitfn(arrays, axis_name)`` and
      expected to return the split arrays as its first result — TODO confirm
      the full contract against the caller.
    src_pytree: The source jax Array pytree.
    target_shardings: The target shardings (a pytree matching src_pytree).

  Returns:
    Pre-resharded src_pytree.
  """
  # Shardings of the current source leaves, same tree structure as src_pytree.
  src_shardings = jax.tree_util.tree_map(
      lambda x: x.sharding,
      src_pytree,
  )
  # For each leaf, either an intermediate NamedSharding or None when no
  # pre-resharding is beneficial (see _maybe_find_intermediate_sharding).
  intermediate_shardings = jax.tree_util.tree_map(
      _maybe_find_intermediate_sharding,
      src_shardings,
      target_shardings,
  )

  src_leaves_with_path, src_treedef = jax.tree_util.tree_flatten_with_path(
      src_pytree
  )
  intermediate_sharding_leaves_with_path, _ = (
      jax.tree_util.tree_flatten_with_path(intermediate_shardings)
  )
  # Re-key the intermediate shardings by tree path so source leaves can be
  # joined with their intermediate sharding by path lookup below.
  intermediate_sharding_leaves_with_path = {
      path: intermediate_sharding
      for path, intermediate_sharding in intermediate_sharding_leaves_with_path
  }

  # Leaves grouped by their intermediate mesh; one jit/split pass per mesh.
  to_split_src_pytree_leaves = {}
  # Flat-leaf indexes (into src_leaves_with_path order) for write-back.
  to_split_src_pytree_leaves_indexes = {}
  to_split_intermediate_sharding_leaves = {}

  intermediate_mesh = None
  to_update_src_pytree_leaves = []

  for i, (path, src) in enumerate(src_leaves_with_path):
    to_update_src_pytree_leaves.append(src)
    # NOTE(review): the walrus relies on a NamedSharding being truthy (None
    # leaves are skipped); `is not None` would be more explicit — confirm
    # NamedSharding never overrides __bool__.
    if intermediate_sharding := intermediate_sharding_leaves_with_path.get(
        path, None
    ):
      # The to_split_axis should always be the same along all the intermediate
      # shardings.
      intermediate_mesh = intermediate_sharding.mesh
      to_split_src_pytree_leaves.setdefault(intermediate_mesh, []).append(src)
      to_split_src_pytree_leaves_indexes.setdefault(intermediate_mesh, []).append(i)
      to_split_intermediate_sharding_leaves.setdefault(intermediate_mesh, []).append(intermediate_sharding)

  if intermediate_mesh is None:
    # No pre-resharding is needed.
    return src_pytree

  for _intermediate_mesh in to_split_src_pytree_leaves.keys():
    # Locate the replicated axis that _maybe_find_intermediate_sharding
    # created; it is identified purely by its name suffix.
    to_split_axis = None
    for axis_name in _intermediate_mesh.axis_names:
      if axis_name.endswith(INTERMEDIATE_REPLICA_SUFFIX):
        to_split_axis = axis_name
        break
    assert (
        to_split_axis is not None
    ), f'No replica axis found in the intermediate mesh {_intermediate_mesh}.'

    # Reshard this mesh's leaves onto their intermediate shardings via an
    # identity jit whose out_shardings drive the device movement.
    temp_source = jax.jit(
        _identity,
        out_shardings=to_split_intermediate_sharding_leaves[_intermediate_mesh],
    )(to_split_src_pytree_leaves[_intermediate_mesh])

    # Update the to_split_src_pytree_leaves with the new splitted array.
    updated_to_split_src_pytree_leaves, *_ = splitfn(temp_source, to_split_axis)

    # Write each split result back into its original flat-leaf position.
    for i in range(len(to_split_src_pytree_leaves_indexes[_intermediate_mesh])):
      to_update_src_pytree_leaves[
          to_split_src_pytree_leaves_indexes[_intermediate_mesh][i]
      ] = updated_to_split_src_pytree_leaves[i]

  updated_src_pytree = jax.tree_util.tree_unflatten(
      src_treedef, to_update_src_pytree_leaves
  )
  return updated_src_pytree
310-
311-
31261#
31362
31463
0 commit comments