 
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+import asyncio
+import atexit
+import importlib.util
+import subprocess
+import threading
 from abc import abstractmethod
 from collections import deque
 from dataclasses import dataclass
 from functools import partial
 from os import environ
 from pathlib import Path
-from torchchat.cli.builder import BuilderArgs, TokenizerArgs
 from typing import List, Optional
 from uuid import uuid4
 
-import asyncio
-import atexit
 import torch.multiprocessing as mp
-import threading
-import importlib.util
-import subprocess
+from torchchat.cli.builder import BuilderArgs, TokenizerArgs
 
 
-def _setup_env(world_size:int, rank:int, target: callable, *args, **kwargs):
+def _setup_env(world_size: int, rank: int, target: callable, *args, **kwargs):
     environ["MASTER_ADDR"] = "localhost"
     environ["MASTER_PORT"] = "29500"
     environ["RDZV_BACKEND"] = "c10d"
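Note: `_setup_env` exports the rendezvous variables before each spawned process runs its target; the hunk is cut off here, but the remaining lines presumably export `RANK` and `WORLD_SIZE` and then invoke `target(*args, **kwargs)`. A minimal sketch of the consumer side, assuming `dist_run.main` initializes torch.distributed from this environment (the backend choice and the exact call are assumptions, not shown in the diff):

```python
import os

import torch.distributed as dist


def _worker_entry():
    # MASTER_ADDR / MASTER_PORT come from _setup_env above; RANK and
    # WORLD_SIZE are presumably set by the truncated remainder of it.
    dist.init_process_group(
        backend="nccl",  # assumption: NCCL for multi-GPU inference
        rank=int(os.environ["RANK"]),
        world_size=int(os.environ["WORLD_SIZE"]),
    )
```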
@@ -36,10 +36,11 @@ def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
     # create programmatic elastic launch
     print("Launching distributed inference ...")
 
-    num_processes_per_node = 4  # builder_args.num_gpus + 1
+    num_processes_per_node = builder_args.pp * builder_args.tp
 
     from torchchat.distributed.dist_run import main
-    mp.set_start_method('spawn')
+
+    mp.set_start_method("spawn")
 
     pipes = []
     procs = []
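Note: the hardcoded process count is replaced by the product of the pipeline-parallel (`pp`) and tensor-parallel (`tp`) degrees from `BuilderArgs`, i.e. the world size of the 2D parallelism mesh. A quick arithmetic illustration (values are illustrative, not from the diff):

```python
# Illustrative only: pp pipeline stages, each sharded across tp GPUs.
pp, tp = 2, 4
num_processes_per_node = pp * tp  # 8 ranks, one per GPU
```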
@@ -48,25 +49,24 @@ def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
         pipes.append(server_pipe)
         proc = mp.Process(
             target=partial(_setup_env, num_processes_per_node, rank, main),
-            args=(builder_args, client_pipe)
+            args=(builder_args, client_pipe),
         )
         proc.start()
 
-
     for pipe in pipes:
         response = pipe.recv()
 
-    print(
-        f"Done launching distributed inference on **4 ** {builder_args.num_gpus} GPUs."
-    )
+    print(f"Done launching distributed inference on {num_processes_per_node} GPUs.")
     return procs, pipes
 
+
 @dataclass
 class Output:
     is_finished: bool = False
     text: Optional[str] = None
     token: Optional[list] = None
 
+
 @dataclass
 class Request:
     request_id: int
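Note: each rank gets its own duplex pipe: the worker receives `client_pipe` as an argument, while the parent keeps `server_pipe` and blocks on `recv()` as a readiness barrier before declaring the launch done. The `mp.Pipe()` call itself sits just above the visible context, so this pairing is a sketch of what it implies, not the committed line:

```python
# Sketch of the per-rank pipe setup implied by the hunk above; the exact
# placement of mp.Pipe() is outside the visible context and is assumed.
for rank in range(num_processes_per_node):
    server_pipe, client_pipe = mp.Pipe(duplex=True)
    pipes.append(server_pipe)  # parent keeps this end for step/stop commands
    # client_pipe is handed to the spawned worker via args=(...)
```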
@@ -84,7 +84,7 @@ def __init__(
         generator_args,
         pipes,
         loop,
-        ):
+    ):
         self.builder_args = builder_args
         self.generator_args = generator_args
         self.requests = {}
@@ -107,7 +107,7 @@ def process_requests_loop(self):
             if req == "stop":
                 break
             self.requests = {req.request_id: req.prompt}
-
+
             responses = {}
             running = True
             while running:
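Note: the scheduler thread blocks until either a `Request` or the `"stop"` sentinel arrives, then drives `step()` until the batch finishes. A sketch of the feeding side, assuming a queue named `request_queue` connects `DistributedGenerator.generate` to this loop (the queue is created outside the visible hunks, so the name is an assumption):

```python
from uuid import uuid4

# Producer-side sketch; `request_queue` and the Request fields are
# assumptions consistent with the loop above (req.request_id, req.prompt).
req = Request(request_id=uuid4().int, prompt="Hello")
scheduler.request_queue.put(req)
# ... later, to let the scheduler thread exit cleanly:
scheduler.request_queue.put("stop")
```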
@@ -128,17 +128,17 @@ async def wait_for_request(self, req: Request) -> Output:
             yield output
         del self.req_to_states[req.request_id]
         del self.req_to_results[req.request_id]
-
+
     def step(self) -> List[Output]:
         responses = []
-        #TODO: Implement a scheduler to handle the requests
+        # TODO: Implement a scheduler to handle the requests
         if len(self.in_flight_requests) > 0:
-            #Receive decoded token
+            # Receive decoded token
             for p in self.pipes:
                 p.send("step")
             for p in self.pipes:
                 responses.append(p.recv())
-
+
         else:
             # Send requests to backend
             self.in_flight_batch_order = list(self.requests.keys())
@@ -148,25 +148,26 @@ def step(self) -> List[Output]:
             self.in_flight_requests = self.requests
             self.requests = {}
             self.current_step = 0
-            #Receive first token
+            # Receive first token
             for p in self.pipes:
                 responses.append(p.recv())
-        responses = responses[0]
+        # Filter out None responses from in-between stages
+        responses = [r for r in responses if r is not None][0]
         outputs = []
         for k, v in zip(self.in_flight_batch_order, zip(responses[0], responses[1])):
             text, token_ids = v
             outputs.append(
                 Output(
-                    is_finished=self.current_step >= self.generator_args.max_new_tokens,
+                    is_finished=self.current_step >= self.generator_args.max_new_tokens,
                     text=text,
                     token=token_ids,
-                    )
                 )
+            )
         if self.current_step >= self.generator_args.max_new_tokens:
             for p in self.pipes:
                 p.send("stop")
             self.in_flight_requests = []
-
+
         self.current_step += 1
 
         return outputs
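Note: `step()` drives all worker pipes in lockstep: `"step"` requests one more decoded token, `"stop"` ends the batch, and only non-`None` replies are kept because intermediate pipeline stages have no decoded text to report. The worker side lives in `dist_run.main` and is not part of this diff; a sketch of the loop it would need to satisfy this protocol (all names here are assumptions derived from the parent side):

```python
def worker_loop(pipe, is_last_stage):
    # Counterpart to Scheduler.step(); inferred from the parent side only.
    while True:
        cmd = pipe.recv()
        if cmd == "stop":
            break
        # cmd == "step": advance decoding by one token for the batch.
        texts, token_ids = decode_one_token()  # hypothetical helper
        # Only the last pipeline stage holds decoded output; the other
        # stages answer None, which Scheduler.step() filters out.
        pipe.send((texts, token_ids) if is_last_stage else None)
```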
@@ -177,24 +178,28 @@ def __init__(
         self,
         builder_args: BuilderArgs,
         tokenizer_args: TokenizerArgs,
-        #TODO: move GeneratorArgs into a different module
+        # TODO: move GeneratorArgs into a different module
         generator_args,
         profile: Optional[Path],
         quantize: bool,
         draft_quantize: bool,
-        ):
+    ):
         self.builder_args = builder_args
         self.generate_args = generator_args
-
+
+        self.check_args()
+
         self.procs, self.pipes = _launch_distributed_inference(builder_args)
 
         self.loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self.loop)
 
         self.scheduler = Scheduler(builder_args, generator_args, self.pipes, self.loop)
 
-        #TODO: Mode into process and use pipe or queue for comm
-        self.scheduler_thread = threading.Thread(target=self.scheduler.process_requests_loop)
+        # TODO: Move into a process and use a pipe or queue for comm
+        self.scheduler_thread = threading.Thread(
+            target=self.scheduler.process_requests_loop
+        )
         self.scheduler_thread.start()
 
         atexit.register(self.shutdown)
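Note: the scheduler runs on a plain thread with its own asyncio event loop, and cleanup is hooked to interpreter exit via `atexit`. `shutdown()` itself is outside this diff; a plausible sketch given the `"stop"` sentinel and the worker protocol (an assumption, not the committed code):

```python
def shutdown(self):
    # Unblock the scheduler thread, then reap the worker processes.
    self.scheduler.request_queue.put("stop")  # assumed queue name
    self.scheduler_thread.join()
    for proc in self.procs:
        proc.terminate()
```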
@@ -220,3 +225,9 @@ def generate(self, text):
             running &= not output.is_finished
 
             yield output
+
+    def check_args(self):
+        if self.generate_args.chat_mode:
+            raise NotImplementedError(
+                "Currently we only support generate with --distributed"
+            )
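Note: with `check_args()` rejecting chat mode, plain generation is the only supported path under `--distributed`. An end-to-end usage sketch; the builder/tokenizer/generator args are placeholders for real configs, not values from the diff:

```python
gen = DistributedGenerator(
    builder_args,    # BuilderArgs with pp/tp set
    tokenizer_args,  # TokenizerArgs
    generator_args,  # must have chat_mode=False
    profile=None,
    quantize=False,
    draft_quantize=False,
)
for output in gen.generate("Write a haiku about distributed inference"):
    if output.text is not None:
        print(output.text, end="", flush=True)
```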