This repository was archived by the owner on Oct 19, 2024. It is now read-only.

Commit 872fe5a

[FEATURE] pass global config to worker and set manual sharding of intermediates (#928)

1 parent: 4e92039

6 files changed, +218 -43 lines

alpa/device_mesh.py (6 additions, 2 deletions)

@@ -111,7 +111,7 @@ class MeshHostWorker:
 
     def __init__(self, server_address: str, num_hosts: int, host_id: int,
                  mesh_id: int, move_worker: DaemonMoveWorker,
-                 runtime_random_seed: int):
+                 runtime_random_seed: int, worker_global_config: dict):
         self.num_hosts = num_hosts
         self.host_id = host_id
         self.mesh_id = mesh_id
@@ -124,6 +124,9 @@ def __init__(self, server_address: str, num_hosts: int, host_id: int,
         self.distributed_client.connect()
         logger.debug(
             f"{host_id}: Success to connect to xla runtime at {server_address}")
+
+        # Set global config to follow the driver
+        global_config.update_worker_config(worker_global_config)
         if global_config.backend == "gpu":
             self.backend = xla_client.make_gpu_client(self.distributed_client,
                                                       node_id=host_id)
@@ -1139,7 +1142,8 @@ def launch_xla_servers(self):
                 "env_vars": env_vars
             }).remote(server_address, self.num_hosts, i,
                       self.mesh_id, move_worker,
-                      global_config.runtime_random_seed)
+                      global_config.runtime_random_seed,
+                      global_config)
             workers.append(worker)
         return service_server, workers
 
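The gist of this file's change: each MeshHostWorker Ray actor now receives the driver's global_config at construction time and copies it into its own process before creating the XLA backend, so driver and workers agree on backend choice, seeds, and runtime options. The following toy sketch shows the same pass-the-config-into-the-actor pattern with plain Ray; the class and field names are illustrative stand-ins, not Alpa's API.

# Toy sketch of propagating a driver-side config into a Ray actor.
# DriverConfig and HostWorker are illustrative stand-ins, not Alpa classes.
import ray


class DriverConfig:

    def __init__(self, backend="gpu", runtime_random_seed=42):
        self.backend = backend
        self.runtime_random_seed = runtime_random_seed


@ray.remote
class HostWorker:

    def __init__(self, worker_global_config):
        # Keep a copy of the driver's settings so later decisions
        # (backend creation, RNG seeding) match the driver exactly.
        self.config = worker_global_config

    def backend(self):
        return self.config.backend


ray.init()
driver_config = DriverConfig()
worker = HostWorker.remote(driver_config)  # the config object is pickled over
assert ray.get(worker.backend.remote()) == "gpu"
ray.shutdown()
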
alpa/global_env.py (30 additions, 0 deletions)

@@ -105,6 +105,36 @@ def ray_accelerator_name(self):
         backend_to_ray = {"gpu": "GPU"}
         return backend_to_ray[self.backend]
 
+    def update_worker_config(self, cfg: "GlobalConfig"):
+        """Update the worker config based on the host one"""
+        self.backend = cfg.backend
+        # Random seed used for compilation
+        self.compile_random_seed = cfg.compile_random_seed
+        # Random seed used for runtime
+        self.runtime_random_seed = cfg.runtime_random_seed
+        # XLA server port range
+        self.xla_server_port_start = cfg.xla_server_port_start
+        self.xla_server_port_end = cfg.xla_server_port_end
+        # XLA gpu kernel auto-tuning level
+        self.xla_gpu_autotune_level = cfg.xla_gpu_autotune_level
+        # Whether to use AWS EFA network interface
+        self.use_aws_efa = cfg.use_aws_efa
+        ########## Options of pipeline runtime ##########
+        # Whether to sync before and after the executable for accurate internal
+        # timer
+        self.pipeline_sync_for_timer = cfg.pipeline_sync_for_timer
+        # Whether to use single-byte signal tensor for send/recv.
+        # This is a debug option.
+        self.pipeline_use_signal_send_recv = cfg.pipeline_use_signal_send_recv
+        # Whether to use the scatter-gater/local-all-gather optimization.
+        self.use_local_allgather = cfg.use_local_allgather
+        # Cross mesh resharding mode. Possible choices: {"send_recv",
+        # "broadcast"}
+        self.resharding_mode = cfg.resharding_mode
+        self.nccl_mode = cfg.nccl_mode
+        self.enable_overlapping = cfg.enable_overlapping
+        self.collect_trace = cfg.collect_trace
+
 
 global_config = GlobalConfig()
 
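Note that update_worker_config copies an explicit list of fields rather than the whole object, so only the settings that must be consistent between driver and workers are overwritten. A hypothetical usage, assuming Alpa is installed; this snippet is not part of the commit.

# Hypothetical check of the new method; assumes Alpa is importable.
from alpa.global_env import GlobalConfig

driver_cfg = GlobalConfig()
driver_cfg.runtime_random_seed = 1234
driver_cfg.resharding_mode = "broadcast"

worker_cfg = GlobalConfig()             # fresh defaults, as on a remote host
worker_cfg.update_worker_config(driver_cfg)

assert worker_cfg.runtime_random_seed == 1234
assert worker_cfg.resharding_mode == "broadcast"
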
alpa/pipeline_parallel/compile_executable.py (39 additions, 34 deletions)

@@ -15,9 +15,7 @@
 from alpa.pipeline_parallel.pipeshard_executable import PipeshardDriverExecutable
 from alpa.pipeline_parallel.runtime_emitter import (
     OverlapFriendlyPipelineInstEmitter, PipelineInstEmitter)
-from alpa.pipeline_parallel.schedules import (GpipeSchedule,
-                                              OverlapFriendlyPipeDreamSchedule,
-                                              PipeDreamFlush, InferenceSchedule)
+from alpa.pipeline_parallel.schedules import create_pipeline_schedule
 from alpa.pipeline_parallel.computation import (
     create_donation_mapping, generate_computations_from_modules,
     generate_sharded_xla_computations,
@@ -38,6 +36,7 @@
 from alpa.shard_parallel.manual_sharding import (ManualShardingOption,
                                                  ParsedManualShardingOption,
                                                  get_flatten_axis_resources,
+                                                 get_intermediate_parsed_spec,
                                                  parsed_spec_to_opsharding)
 from alpa.util import (get_var_mapping, trace_jaxpr_with_micro_batch,
                        OrderedSet, GradFuncTransformContext)
@@ -198,31 +197,14 @@ def compile_pipeshard_executable_internal(
     debug_compilation_time("apply grad")
 
     # Generate pipeline schedule and placement
-    dependency = gen_dependency_with_stages(jax_pipeline_stages,
-                                            sliced_apply_grad_stages)
-    if pipeline_schedule == "gpipe":
-        schedule = GpipeSchedule(dependency=dependency,
-                                 meshes=sliced_virtual_meshes,
-                                 apply_grad_placement=apply_grad_placement,
-                                 num_batch=num_microbatch)
-    elif pipeline_schedule == "1f1b":
-        schedule = PipeDreamFlush(dependency=dependency,
-                                  meshes=sliced_virtual_meshes,
-                                  apply_grad_placement=apply_grad_placement,
-                                  num_batch=num_microbatch)
-    elif pipeline_schedule == "inference":
-        schedule = InferenceSchedule(dependency=dependency,
-                                     meshes=sliced_virtual_meshes,
-                                     apply_grad_placement=apply_grad_placement,
-                                     num_batch=num_microbatch)
-    elif pipeline_schedule == "1f1b_overlap_friendly":
-        schedule = OverlapFriendlyPipeDreamSchedule(
-            dependency=dependency,
-            meshes=sliced_virtual_meshes,
-            apply_grad_placement=apply_grad_placement,
-            num_batch=num_microbatch)
-    else:
-        raise ValueError(f"Invalid schedule: {pipeline_schedule}")
+    dependency, fwd_intermediates = gen_dependency_with_stages(
+        jax_pipeline_stages, num_meshes, sliced_apply_grad_stages)
+    schedule = create_pipeline_schedule(
+        pipeline_schedule,
+        dependency=dependency,
+        meshes=sliced_virtual_meshes,
+        apply_grad_placement=apply_grad_placement,
+        num_batch=num_microbatch)
 
     # Forcibly set the sharding specs of global invars and outvars.
     # FIXME(yonghao): the invar can appear on multiple meshes and thus different
@@ -245,7 +227,7 @@ def compile_pipeshard_executable_internal(
          output_sharding_dicts) = get_manual_input_output_sharding_specs(
              jax_all_stages, manual_stage_option.submesh_logical_shapes,
              parsed_manual_sharding_option, global_invars, global_outvars,
-             schedule.stage_mesh_mapping)
+             schedule.stage_mesh_mapping, fwd_intermediates)
     else:
         input_sharding_dicts = [input_sharding_dict] * num_meshes
         output_sharding_dicts = [output_sharding_dict] * num_meshes
@@ -353,7 +335,7 @@ def split_and_process_layers(closed_jaxpr, full_batch_closed_jaxpr,
 
 def get_manual_input_output_sharding_specs(stages, mesh_shapes, ms_option,
                                            global_invars, global_outvars,
-                                           stage_to_mesh):
+                                           stage_to_mesh, fwd_intermediates):
     """
     Split user assigned input and output PartitionSpec into sharding specs for
     each pipeline stage.
@@ -363,19 +345,33 @@ def get_manual_input_output_sharding_specs(stages, mesh_shapes, ms_option,
     var_to_pspec = {}
     handle_invar = False
     handle_outvar = False
+    # Add global input and output's parsed partition spec.
     if ms_option.in_parsed_pspec is not None:
         var_to_pspec.update(dict(zip(global_invars, ms_option.in_parsed_pspec)))
         handle_invar = True
     if ms_option.out_parsed_pspec is not None:
         var_to_pspec.update(
             dict(zip(global_outvars, ms_option.out_parsed_pspec)))
         handle_outvar = True
+    # Add pipeline intermediate's parsed partition spec.
+    intermediate_to_pspec = {}
+    if ms_option.pipeline_intermediate_axes is not None:
+        for v in fwd_intermediates:
+            # TODO: This is a simple heuristic: we simply replicate 1d tensors.
+            if len(v.aval.shape) <= 1:
+                continue
+            intermediate_to_pspec[v] = get_intermediate_parsed_spec(
+                ms_option.pipeline_intermediate_axes, len(v.aval.shape))
+
     submesh_axis_names = ms_option.submesh_axis_names
     if submesh_axis_names is None:
         submesh_axis_names = [ms_option.mesh_axis_names] * len(mesh_shapes)
 
     def get_vars_to_sharding_specs(variables, mesh_shape, mesh_axis_names):
-        parsed_specs = [var_to_pspec[v] for v in variables]
+        parsed_specs = [
+            (var_to_pspec[v] if v in var_to_pspec else intermediate_to_pspec[v])
+            for v in variables
+        ]
         avals = [v.aval for v in variables]
         var_op_shardings = parsed_spec_to_opsharding(parsed_specs, avals,
                                                      mesh_shape,
@@ -398,8 +394,13 @@ def get_vars_to_sharding_specs(variables, mesh_shape, mesh_axis_names):
         # invars
         if handle_invar:
             invar_in_global = [var for var in stage.invars if var in invar_set]
+            # add intermediate vars
+            intermediate_var = [
+                var for var in stage.invars if var in intermediate_to_pspec
+            ]
+            invars = invar_in_global + intermediate_var
             stage_invar_shardings = get_vars_to_sharding_specs(
-                invar_in_global, mesh_shape, mesh_axis_names)
+                invars, mesh_shape, mesh_axis_names)
         else:
             stage_invar_shardings = {}
         # outvars
@@ -458,13 +459,17 @@ def shard_each_stage(jax_all_stages, virtual_meshes, schedule, num_meshes,
     compile_intermediate = [None] * num_meshes
     total_flops = 0
     for mesh_idx in range(num_meshes):
-        input_sharding_dict = input_sharding_dicts[mesh_idx]
-        output_sharding_dict = output_sharding_dicts[mesh_idx]
         virtual_mesh = virtual_meshes[mesh_idx]
         logical_mesh = virtual_mesh.get_logical_mesh(
             logical_mesh_shapes[mesh_idx])
         autosharding_option = dataclasses.replace(
             default_as_option, **autosharding_option_dicts[mesh_idx])
+
+        # Predefined shardings. stage_input_sharding should have shardings for
+        # all parameters, while the sharding dict can have only a portion of
+        # all parameters.
+        input_sharding_dict = input_sharding_dicts[mesh_idx]
+        output_sharding_dict = output_sharding_dicts[mesh_idx]
         stage_input_sharding = stage_input_shardings[mesh_idx]
 
         # Setup dummy stages

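The new code path collects the activations passed between forward stages (fwd_intermediates) and assigns each one a partition spec derived from pipeline_intermediate_axes, leaving tensors of rank 1 or less without an intermediate spec so they stay replicated. A standalone sketch of that rank heuristic, using made-up stand-ins rather than real JAX abstract values:

# Standalone sketch of the rank heuristic: only intermediates with rank >= 2
# receive a pipeline_intermediate_axes spec; 1-D tensors and scalars are
# skipped. FakeVar stands in for a pipeline variable and its abstract shape.
from collections import namedtuple

FakeVar = namedtuple("FakeVar", ["name", "shape"])

fwd_intermediates = [
    FakeVar("activation", (32, 1024)),  # rank 2 -> gets a sharding spec
    FakeVar("timestep", (32,)),         # rank 1 -> skipped, stays replicated
]

needs_spec = [v for v in fwd_intermediates if len(v.shape) > 1]
print([v.name for v in needs_spec])     # ['activation']
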
alpa/pipeline_parallel/schedules.py (23 additions, 2 deletions)

@@ -2,7 +2,7 @@
 import itertools
 import logging
 from abc import abstractmethod, ABCMeta
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import numpy as np
 
@@ -15,23 +15,29 @@
 
 def gen_dependency_with_stages(
         compute_stages: List[PipelineComputation],
+        num_mesh: int,
         apply_grad_stages: List[PipelineComputation] = ()):
     """Generate the dependency matrix for a list of pipeline stages."""
     n_stages = len(compute_stages) + len(apply_grad_stages)
     d = np.zeros([n_stages, n_stages], dtype=int)
     var_stage_id = {}
+    fwd_intermediate_vars = OrderedSet()
     for i, stage in enumerate(itertools.chain(compute_stages,
                                               apply_grad_stages)):
         for var in stage.invars:
             if var in var_stage_id:
                 d[i, var_stage_id[var]] = 1
+                if i < num_mesh and var_stage_id[var] != 2 * num_mesh - i - 1:
+                    # not the var from forward to backward. we don't care them.
+                    # not the var on the backward side
+                    fwd_intermediate_vars.add(var)
             else:
                 # Assume the var is from global_invars
                 pass
         for var in stage.outvars:
             var_stage_id[var] = i
 
-    return d
+    return d, fwd_intermediate_vars
 
 
 def gen_linear_pipeline_dependency(num_stage):
@@ -510,3 +516,18 @@ def _generate_schedule(self):
                 scheds[mesh_idx] = (self.last_backward_batch_index, stage_idx)
             schedules.append(scheds)
         return schedules
+
+
+pipeline_schedule: Dict[str, PipelineSchedule] = {}
+pipeline_schedule["gpipe"] = GpipeSchedule
+pipeline_schedule["1f1b"] = PipeDreamFlush
+pipeline_schedule["inference"] = InferenceSchedule
+pipeline_schedule["1f1b_overlap_friendly"] = OverlapFriendlyPipeDreamSchedule
+
+
+def create_pipeline_schedule(name, dependency, meshes, apply_grad_placement,
+                             num_batch):
+    return pipeline_schedule[name](dependency=dependency,
+                                   meshes=meshes,
+                                   apply_grad_placement=apply_grad_placement,
+                                   num_batch=num_batch)

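The if/elif chain in compile_executable.py is replaced by the name-to-class registry plus the create_pipeline_schedule factory above; one consequence is that an unknown schedule name now surfaces as a KeyError from the dict lookup rather than the old explicit ValueError. A standalone sketch of the registry/factory pattern, with stand-in schedule classes rather than Alpa's:

# Standalone sketch of the registry/factory pattern; ToySchedule and its
# subclasses are illustrative stand-ins, not Alpa's schedule classes.
from typing import Dict, Type


class ToySchedule:

    def __init__(self, dependency, meshes, apply_grad_placement, num_batch):
        self.num_batch = num_batch


class ToyGpipe(ToySchedule):
    pass


class Toy1F1B(ToySchedule):
    pass


SCHEDULES: Dict[str, Type[ToySchedule]] = {
    "gpipe": ToyGpipe,
    "1f1b": Toy1F1B,
}


def create_schedule(name, **kwargs):
    # This sketch keeps the explicit error message of the old if/elif chain.
    if name not in SCHEDULES:
        raise ValueError(f"Invalid schedule: {name}")
    return SCHEDULES[name](**kwargs)


sched = create_schedule("1f1b", dependency=None, meshes=[],
                        apply_grad_placement={}, num_batch=8)
assert isinstance(sched, Toy1F1B)
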
alpa/shard_parallel/manual_sharding.py (27 additions, 5 deletions)

@@ -1,13 +1,14 @@
 """User specified manual sharding strategy following pjit's api."""
 import dataclasses
-from typing import Any, Optional, OrderedDict, Tuple, Union
+from typing import Any, Optional, OrderedDict, Sequence, Tuple, Union
 
 from jax._src.lib import xla_client as xc
 from jax._src.tree_util import _replace_nones
 from jax._src.util import safe_zip
 from jax.experimental.pjit import (_is_unspecified, _is_auto, _is_from_gda,
                                    _prepare_axis_resources, get_array_mapping,
-                                   _UNSPECIFIED, ParsedPartitionSpec)
+                                   _UNSPECIFIED, PartitionSpec,
+                                   ParsedPartitionSpec)
 from jax.interpreters import mlir, pxla
 from jax.tree_util import tree_unflatten, tree_flatten, tree_map
 
@@ -22,6 +23,12 @@ class ManualShardingOption:
     # According to pjit, None means replicated.
     in_axis_resources: Any = _UNSPECIFIED
     out_axis_resources: Any = _UNSPECIFIED
+    # To enable data parallel for multiple pipeline stages, where the input
+    # activation is not a global invar. Currently defined by (dim_name, dim_idx)
+    # TODO: a better design to allow only applying this rule to a subset of
+    # intermediate, because some pipeline communicated tensors do not have a
+    # batch dim. e.g. the time vector in diffusion generated at the first stage.
+    pipeline_intermediate_axes: Sequence[Tuple[str, int]] = None
 
 
 @dataclasses.dataclass
@@ -32,6 +39,7 @@ class ParsedManualShardingOption:
     # Parsed and flatten status
     in_parsed_pspec: Tuple[ParsedPartitionSpec, ...] = None
     out_parsed_pspec: Tuple[ParsedPartitionSpec, ...] = None
+    pipeline_intermediate_axes: Sequence[Tuple[str, int]] = None
 
 
 def _parsed_pspec_to_hlo_sharding(
@@ -121,9 +129,9 @@ def get_flatten_axis_resources(sharding_option: ManualShardingOption, in_tree,
     else:
         out_axis_flat = _prepare_axis_and_flatten(
             sharding_option.out_axis_resources, out_tree, "out_axis_resources")
-    return ParsedManualShardingOption(sharding_option.mesh_axis_names,
-                                      sharding_option.submesh_axis_names,
-                                      in_axis_flat, out_axis_flat)
+    return ParsedManualShardingOption(
+        sharding_option.mesh_axis_names, sharding_option.submesh_axis_names,
+        in_axis_flat, out_axis_flat, sharding_option.pipeline_intermediate_axes)
 
 
 def parsed_spec_to_opsharding(axes, avals, mesh_shape, mesh_axis_names):
@@ -156,3 +164,17 @@ def get_manual_sharding_spec(
         parsed_resources.out_parsed_pspec, out_avals, mesh_shape,
         mesh_axis_names)
     return in_op_shardings, out_op_shardings
+
+
+def get_intermediate_parsed_spec(intermediate_dims,
+                                 dim_len,
+                                 allow_unconstrained_dims=False):
+    axes = [None] * dim_len
+    for (name, dim) in intermediate_dims:
+        axes[dim] = name
+    pspec = PartitionSpec(*axes)
+    parsed_pspec = ParsedPartitionSpec.from_user_input(
+        pspec,
+        "intermediate specifications",
+        allow_unconstrained_dims=allow_unconstrained_dims)
+    return parsed_pspec

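The user-facing knob is the new ManualShardingOption.pipeline_intermediate_axes field, a sequence of (mesh_axis_name, tensor_dim) pairs; get_intermediate_parsed_spec turns it into a ParsedPartitionSpec of the requested rank. A hypothetical usage, assuming Alpa and a compatible JAX version are installed; this snippet is not part of the commit.

# Hypothetical usage of the new helper; assumes Alpa is importable and its
# pinned JAX version provides ParsedPartitionSpec.from_user_input.
from alpa.shard_parallel.manual_sharding import get_intermediate_parsed_spec

# Shard dim 0 of a rank-3 pipeline intermediate along the "data" mesh axis;
# this corresponds to PartitionSpec('data', None, None).
parsed = get_intermediate_parsed_spec([("data", 0)], dim_len=3)
print(parsed)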