This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 1faa052

[WIP] Move distributed inference into its own generator
1 parent e8bb076 commit 1faa052

File tree

5 files changed (+179, −86 lines)

torchchat/cli/builder.py

Lines changed: 1 addition & 71 deletions
@@ -508,72 +508,7 @@ def _load_model(builder_args: BuilderArgs) -> Model:
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
-
-
-import importlib.util
-import subprocess
-
-
-def run_script(script_path, *args):
-    # Construct the command to run the script
-    cmd = [sys.executable, script_path] + list(args)
-
-    # Run the script as a subprocess
-    process = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
-    )
-
-    # Stream the output in real-time
-    for line in process.stdout:
-        print(line, end="")
-    for line in process.stderr:
-        print(line, end="", file=sys.stderr)
-
-    # Wait for the process to complete and get the return code
-    return_code = process.wait()
-    if return_code != 0:
-        raise subprocess.CalledProcessError(return_code, cmd)
-
-
-def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
-    # create programmatic elastic launch
-    print("Launching distributed inference ...")
-
-    num_processes_per_node = 4  # builder_args.num_gpus + 1
-
-    lc = launcher.LaunchConfig(
-        min_nodes=1,
-        max_nodes=1,
-        nproc_per_node=num_processes_per_node,
-        # run_id=str(uuid.uuid4()),
-        rdzv_backend="c10d",
-        rdzv_endpoint="localhost:29401",
-        max_restarts=0,
-        monitor_interval=1,
-    )
-
-    train_file_path = Path(__file__).parent.parent.parent / "dist_run.py"
-    print(f"train_file_path: {train_file_path}")
-    # import argparse
-
-    # parser2 = argparse.ArgumentParser()
-
-    # args = parser2.parse_args()
-    args = []
-    print(f"args: {args}")
-
-    elastic_launch(
-        config=lc,
-        entrypoint=run_script,
-    )(train_file_path, *args)
-    print(
-        f"Done launching distributed inference on **4 ** {builder_args.num_gpus} GPUs."
-    )
-    # role=role, *args, **kwargs)
-
-    # assert False, "distributed inference is not supported yet"
-    # pass
-
+
 
 def _initialize_model(
     builder_args: BuilderArgs,

@@ -583,11 +518,6 @@ def _initialize_model(
     support_tensor_subclass: bool = True,
 ) -> Model:
     print("Loading model...")
-    if builder_args.distributed:
-        # we part ways here with torchchat cli and move into dist inference
-        _launch_distributed_inference(builder_args)
-        return None
-
     if builder_args.gguf_path and (builder_args.dso_path or builder_args.pte_path):
         print("Setting gguf_kwargs for generate.")
         is_dso = builder_args.dso_path is not None

torchchat/cli/cli.py

Lines changed: 16 additions & 0 deletions
@@ -409,6 +409,22 @@ def _add_distributed_args(parser) -> None:
         help=argparse.SUPPRESS,
         # "Use the specified model checkpoint directory",
     )
+    parser.add_argument(
+        "--pp",
+        "--pipeline-parallel",
+        type=int,
+        default=1,
+        help=argparse.SUPPRESS,
+        # "Pipeline parallel degree",
+    )
+    parser.add_argument(
+        "--tp",
+        "--tensor-parallel",
+        type=int,
+        default=1,
+        help=argparse.SUPPRESS,
+        # "Tensor parallel degree",
+    )
 
 
 # Add CLI Args related to custom model inputs
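Together, the two new flags describe a two-dimensional parallelism layout: --pp pipeline stages times --tp tensor-parallel ranks per stage. As a point of reference only, such degrees are commonly turned into a device mesh as in the sketch below; the mesh construction is an assumption for illustration, not code from this commit.

# Illustrative sketch (assumed, not part of this commit): map the --pp/--tp
# degrees onto a 2-D device mesh. Requires a launcher that sets WORLD_SIZE
# (e.g. torchrun) with exactly pp * tp ranks.
import os

from torch.distributed.device_mesh import init_device_mesh


def build_mesh(pp: int, tp: int):
    world_size = int(os.environ["WORLD_SIZE"])  # set by torchrun / elastic agent
    assert pp * tp == world_size, "pp * tp must equal the number of ranks"
    mesh = init_device_mesh("cuda", (pp, tp), mesh_dim_names=("pp", "tp"))
    # 1-D sub-meshes, one per parallelism dimension
    return mesh["pp"], mesh["tp"]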
File renamed without changes.

torchchat/distributed/generate.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from abc import abstractmethod
+from typing import List, Optional
+from dataclasses import dataclass
+from pathlib import Path
+from torchchat.cli.builder import BuilderArgs, TokenizerArgs
+
+
+import importlib.util
+import subprocess
+
+
+def run_script(script_path, *args):
+    # Construct the command to run the script
+    cmd = [sys.executable, script_path] + list(args)
+
+    # Run the script as a subprocess
+    process = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
+    )
+
+    # Stream the output in real-time
+    for line in process.stdout:
+        print(line, end="")
+    for line in process.stderr:
+        print(line, end="", file=sys.stderr)
+
+    # Wait for the process to complete and get the return code
+    return_code = process.wait()
+    if return_code != 0:
+        raise subprocess.CalledProcessError(return_code, cmd)
+
+
+def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
+    # create programmatic elastic launch
+    print("Launching distributed inference ...")
+
+    num_processes_per_node = 4  # builder_args.num_gpus + 1
+
+    lc = launcher.LaunchConfig(
+        min_nodes=1,
+        max_nodes=1,
+        nproc_per_node=num_processes_per_node,
+        # run_id=str(uuid.uuid4()),
+        rdzv_backend="c10d",
+        rdzv_endpoint="localhost:29401",
+        max_restarts=0,
+        monitor_interval=1,
+    )
+
+    # train_file_path = Path(__file__).parent.parent.parent / "dist_run.py"
+    # print(f"train_file_path: {train_file_path}")
+    # import argparse
+
+    # parser2 = argparse.ArgumentParser()
+
+    # args = parser2.parse_args()
+    args = []
+    print(f"args: {args}")
+
+    from dist_run import main
+
+    elastic_launch(
+        config=lc,
+        entrypoint=run_script,
+    )(main, *args)
+    print(
+        f"Done launching distributed inference on **4 ** {builder_args.num_gpus} GPUs."
+    )
+    # role=role, *args, **kwargs)
+
+    # assert False, "distributed inference is not supported yet"
+    # pass
+
+@dataclass
+class Output:
+    request_id: int
+    is_finished: bool = False
+    output: Optional[str] = None
+
+class Generator(object):
+
+    @abstractmethod
+    def add_request(self, request_id: int, prompt: str):
+        raise NotImplementedError()
+
+    def step(self) -> List[Output]:
+        raise NotImplementedError()
+
+
+class DistributedGenerator(Generator):
+    def __init__(
+        self,
+        builder_args: BuilderArgs,
+        speculative_builder_args: BuilderArgs,
+        tokenizer_args: TokenizerArgs,
+        #TODO: move GeneratorArgs into a different module
+        # generator_args: GeneratorArgs,
+        profile: Optional[Path],
+        quantize: bool,
+        draft_quantize: bool,
+    ):
+        self.requests = {}
+        # if builder_args.distributed:
+        #     # we part ways here with torchchat cli and move into dist inference
+        _launch_distributed_inference(builder_args)
+        #     return None
+
+
+    def add_request(self, request_id: int, prompt: str):
+        assert request_id not in self.requests
+        self.requests[request_id] = prompt
+
+
+    def step(self) -> List[Output]:
+        outputs = []
+        for request_id, prompt in self.requests.items():
+            outputs.append(Output(request_id, is_finished=True, output=prompt))
+
+        for output in outputs:
+            if output.is_finished:
+                del self.requests[output.request_id]
+
+        return outputs
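As committed, the new module still references sys, launcher, and elastic_launch without importing them, and the elastic entrypoint is handed the main function even though run_script expects a script path. Below is a self-contained sketch of the launch this is building toward, using the stock torch.distributed.launcher API; the imports, the dist_run.py path, and the simplified run_script are assumptions for illustration, not part of the commit.

# Minimal sketch (assumptions, not from this commit) of a programmatic elastic
# launch that re-executes dist_run.py once per local rank.
import subprocess
import sys
from pathlib import Path

from torch.distributed import launcher  # provides launcher.LaunchConfig
from torch.distributed.launcher import elastic_launch


def run_script(script_path, *args):
    # Re-run the target script with the current interpreter; raise on failure.
    subprocess.run([sys.executable, str(script_path), *args], check=True)


def launch_dist_run(nproc_per_node: int = 4) -> None:
    lc = launcher.LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=nproc_per_node,
        rdzv_backend="c10d",
        rdzv_endpoint="localhost:29401",
        max_restarts=0,
        monitor_interval=1,
    )
    # Path assumed: dist_run.py at the repository root, three levels up.
    script_path = Path(__file__).parent.parent.parent / "dist_run.py"
    # elastic_launch runs the entrypoint once per local rank, with rendezvous
    # environment variables (RANK, WORLD_SIZE, MASTER_ADDR, ...) already set.
    elastic_launch(config=lc, entrypoint=run_script)(str(script_path))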

torchchat/generate.py

Lines changed: 34 additions & 15 deletions
@@ -31,6 +31,7 @@
     TokenizerArgs,
 )
 from torchchat.model import Model, ModelType
+from torchchat.distributed.generate import DistributedGenerator
 from torchchat.utils.build_utils import device_sync, set_precision
 from torchchat.utils.device_info import get_device_info
 

@@ -1215,19 +1216,37 @@ def main(args):
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
     generator_args = GeneratorArgs.from_args(args)
-    gen = Generator(
-        builder_args,
-        speculative_builder_args,
-        tokenizer_args,
-        generator_args,
-        args.profile,
-        args.quantize,
-        args.draft_quantize,
-    )
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    if builder_args.distributed:
+    if not builder_args.distributed:
+        gen = Generator(
+            builder_args,
+            speculative_builder_args,
+            tokenizer_args,
+            generator_args,
+            args.profile,
+            args.quantize,
+            args.draft_quantize,
+        )
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+
+
+        for _ in gen.chat(generator_args):
+            pass
+    else:
+        dist_gen = DistributedGenerator(
+            builder_args,
+            speculative_builder_args,
+            tokenizer_args,
+            # generator_args,
+            args.profile,
+            args.quantize,
+            args.draft_quantize,
+        )
+
+        dist_gen.add_request(0, "Tell me a joke")
+        dist_gen.add_request(1, "Tell me another joke")
 
-        return
-    for _ in gen.chat(generator_args):
-        pass
+        outputs = dist_gen.step()
+        while len(outputs):
+            print(outputs)
+            outputs = dist_gen.step()
