This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit adcf232

Added generate method and placeholder scheduler
1 parent 11f29fc commit adcf232

File tree: 5 files changed, +134 additions, -79 deletions


torchchat/cli/builder.py

Lines changed: 0 additions & 3 deletions
@@ -63,7 +63,6 @@ class BuilderArgs:
     pp: int = 1
     tp: int = 1
     chpt_from: str = "hf"
-    ntokens: int = 40
     is_chat_model: bool = False
     prefill_possible: bool = False
     dynamic_shapes: bool = False
@@ -171,7 +170,6 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
         pp = getattr(args, "pp", 1)
         tp = getattr(args, "tp", 1)
         chpt_from = getattr(args, "chpt_from", "hf")
-        ntokens = getattr(args, "ntokens", 40)
         return cls(
             checkpoint_dir=checkpoint_dir,
             checkpoint_path=checkpoint_path,
@@ -191,7 +189,6 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
             pp=pp,
             tp=tp,
             chpt_from=chpt_from,
-            ntokens=ntokens,
             is_chat_model=is_chat_model,
             dynamic_shapes=getattr(args, "dynamic_shapes", False),
             max_seq_length=getattr(args, "max_seq_length", None),

torchchat/cli/cli.py

Lines changed: 0 additions & 7 deletions
@@ -425,13 +425,6 @@ def _add_distributed_args(parser) -> None:
         help=argparse.SUPPRESS,
         # "Tensor parallel degree",
     )
-
-    parser.add_argument(
-        "--ntokens",
-        type=int,
-        default=40,
-        help="Number of tokens to generate",
-    )
     parser.add_argument(
         "--chpt-from",
         type=str,
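
With --ntokens gone, the stopping criterion for a request comes from the generation arguments instead: the new Scheduler.step below compares the current step against generator_args.max_new_tokens. A minimal sketch of that check in isolation; the dataclass and its default value here are illustrative stand-ins, not torchchat's actual GeneratorArgs:

# Illustrative stand-in: the token budget now lives on the generator
# arguments rather than on BuilderArgs.ntokens / the removed --ntokens flag.
from dataclasses import dataclass


@dataclass
class GeneratorArgsSketch:
    max_new_tokens: int = 200  # illustrative default, not taken from this commit


def request_is_finished(current_step: int, args: GeneratorArgsSketch) -> bool:
    # Same comparison the new Scheduler.step performs for each emitted Output.
    return current_step >= args.max_new_tokens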

torchchat/distributed/dist_run.py

Lines changed: 5 additions & 7 deletions
@@ -388,7 +388,7 @@ def main(args, pipe):
     # Batch size. Since we push batches dynamically through the pipeline rather
     # than chunking them, this is effectively micro-batch size in pipeline
     # sense. Thus it is interchangeable with micro-batch size below.
-    batch_size = 4# len(prompt)
+    batch_size = 1# len(prompt)
     seqlen_prefill = 1024 # sequence length
     dim = 4096 # embedding dimension

@@ -410,9 +410,6 @@ def main(args, pipe):
     logger.info(
         f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}"
     )
-
-    # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen
-    input_pos = torch.arange(seqlen_prefill, device=device)
     model.eval()

     # Helper function to get example inputs and outputs for the stages.
@@ -470,6 +467,8 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     logger.info(f"{color.green}Prompt: {prompt}{color.reset}")

     start_pos = 0
+    # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen
+    input_pos = torch.arange(seqlen_prefill, device=device)

     # encode the prompt
     input_ids = _encode_strings(
@@ -511,9 +510,8 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
             res.append(new_token)
             #TODO: Move to a separate decoding thread
             resp = _decode_in_flight(new_token, tokenizer, tp_rank)
-            pipe.send(resp)
+            pipe.send((resp, new_token.tolist()))
         else:
-            logger.info(f"sending None {tp_rank=}")
             pipe.send(None)

         # seqlen = 1 now
@@ -577,7 +575,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
             res.append(new_token)
             #TODO: Move to a separate decoding thread
             resp = _decode_in_flight(new_token, tokenizer, tp_rank)
-            pipe.send(resp)
+            pipe.send((resp, new_token))
         else:
             pipe.send(None)

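
After this change the pipe protocol between the last pipeline stage and the generator process carries a (decoded_text, token_ids) pair per step instead of a bare string, with None still meaning "nothing to report". A self-contained sketch of that protocol using the standard multiprocessing module (dist_run.py itself uses torch.multiprocessing; the worker payloads here are placeholders, not real model output):

# Sketch of the worker-to-generator pipe protocol assumed by the new
# Scheduler.step: each message is (decoded_text, token_ids) or None.
import multiprocessing as mp


def worker(pipe) -> None:
    # Stand-in for the last pipeline stage: send decoded text plus raw token
    # ids for a few steps, then None to signal there is nothing more to report.
    for step in range(3):
        decoded_text = [f"token-{step} "]   # placeholder for _decode_in_flight output
        token_ids = [[100 + step]]          # placeholder for new_token.tolist()
        pipe.send((decoded_text, token_ids))
    pipe.send(None)


if __name__ == "__main__":
    parent_conn, child_conn = mp.Pipe()
    proc = mp.Process(target=worker, args=(child_conn,))
    proc.start()
    while True:
        msg = parent_conn.recv()
        if msg is None:
            break
        text, tokens = msg                  # unpack the (decoded_text, token_ids) pair
        print(text, tokens)
    proc.join()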

torchchat/distributed/generate.py

Lines changed: 124 additions & 45 deletions
@@ -4,15 +4,19 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 from abc import abstractmethod
-from typing import List, Optional
+from collections import deque
 from dataclasses import dataclass
-from pathlib import Path
+from functools import partial
 from os import environ
+from pathlib import Path
 from torchchat.cli.builder import BuilderArgs, TokenizerArgs
-from functools import partial
+from typing import List, Optional
+from uuid import uuid4

+import asyncio
 import atexit
 import torch.multiprocessing as mp
+import threading
 import importlib.util
 import subprocess

@@ -51,7 +55,6 @@ def _launch_distributed_inference(builder_args: BuilderArgs) -> None:

     for pipe in pipes:
         response = pipe.recv()
-        print(f"Received: {response=}")

     print(
         f"Done launching distributed inference on **4 ** {builder_args.num_gpus} GPUs."
@@ -60,56 +63,72 @@ def _launch_distributed_inference(builder_args: BuilderArgs) -> None:

 @dataclass
 class Output:
-    request_id: int
     is_finished: bool = False
-    output: Optional[str] = None
-
-class Generator(object):
+    text: Optional[str] = None
+    token: Optional[list] = None

-    @abstractmethod
-    def add_request(self, request_id: int, prompt: str):
-        raise NotImplementedError()
+@dataclass
+class Request:
+    request_id: int
+    prompt: str

-    def step(self) -> List[Output]:
-        raise NotImplementedError()
+    @classmethod
+    def new_request(cls, prompt):
+        return cls(request_id=uuid4().int, prompt=prompt)


-class DistributedGenerator(Generator):
+class Scheduler(object):
     def __init__(
         self,
-        builder_args: BuilderArgs,
-        speculative_builder_args: BuilderArgs,
-        tokenizer_args: TokenizerArgs,
-        #TODO: move GeneratorArgs into a different module
-        # generator_args: GeneratorArgs,
-        profile: Optional[Path],
-        quantize: bool,
-        draft_quantize: bool,
+        builder_args,
+        generator_args,
+        pipes,
+        loop,
     ):
         self.builder_args = builder_args
+        self.generator_args = generator_args
         self.requests = {}
         self.in_flight_requests = {}
-        # For now we have a static batch order we save separately
         self.in_flight_batch_order = []
-        # if builder_args.distributed:
-        # we part ways here with torchchat cli and move into dist inference
-        self.procs, self.pipes = _launch_distributed_inference(builder_args)
-        self.current_step = 0
-
-        atexit.register(self.shutdown)
-
-    def shutdown(self):
-        for p in self.pipes:
-            p.send("stop")
-        for p in self.procs:
-            p.kill()
-
-    #TODO: Replace against (async) generate
-    def add_request(self, request_id: int, prompt: str):
-        assert request_id not in self.requests
-        self.requests[request_id] = prompt
-
-
+        self.pipes = pipes
+        self.req_to_states = {}
+        self.req_to_results = {}
+        self.request_queue = mp.Queue()
+        self.loop = loop
+
+    def schedule_request(self, req: Request):
+        self.req_to_states[req.request_id] = asyncio.Event()
+        self.req_to_results[req.request_id] = deque()
+        self.request_queue.put(req)
+
+    def process_requests_loop(self):
+        while True:
+            req = self.request_queue.get()
+            if req == "stop":
+                break
+            self.requests = {req.request_id: req.prompt}
+
+            responses = {}
+            running = True
+            while running:
+                outputs = self.step()
+                self.req_to_results[req.request_id].append(outputs[0])
+
+                self.loop.call_soon_threadsafe(self.req_to_states[req.request_id].set)
+
+                running &= not outputs[0].is_finished
+
+    async def wait_for_request(self, req: Request) -> Output:
+        is_finished = False
+        while not is_finished:
+            await self.req_to_states[req.request_id].wait()
+            while len(self.req_to_results[req.request_id]):
+                output = self.req_to_results[req.request_id].popleft()
+                is_finished |= output.is_finished
+                yield output
+        del self.req_to_states[req.request_id]
+        del self.req_to_results[req.request_id]
+
     def step(self) -> List[Output]:
         responses = []
         #TODO: Implement a scheduler to handle the requests
@@ -132,12 +151,72 @@ def step(self) -> List[Output]:
         #Receive first token
         for p in self.pipes:
             responses.append(p.recv())
-
         responses = responses[0]
         outputs = []
-        for k, v in zip(self.in_flight_batch_order, responses):
-            outputs.append(Output(k, is_finished=self.current_step>=self.builder_args.ntokens, output=v))
+        for k, v in zip(self.in_flight_batch_order, zip(responses[0], responses[1])):
+            text, token_ids = v
+            outputs.append(
+                Output(
+                    is_finished=self.current_step>=self.generator_args.max_new_tokens,
+                    text=text,
+                    token=token_ids,
+                )
+            )
+        if self.current_step >= self.generator_args.max_new_tokens:
+            for p in self.pipes:
+                p.send("stop")
+            self.in_flight_requests = []

         self.current_step += 1

         return outputs
+
+
+class DistributedGenerator(object):
+    def __init__(
+        self,
+        builder_args: BuilderArgs,
+        tokenizer_args: TokenizerArgs,
+        #TODO: move GeneratorArgs into a different module
+        generator_args,
+        profile: Optional[Path],
+        quantize: bool,
+        draft_quantize: bool,
+    ):
+        self.builder_args = builder_args
+        self.generate_args = generator_args
+
+        self.procs, self.pipes = _launch_distributed_inference(builder_args)
+
+        self.loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(self.loop)
+
+        self.scheduler = Scheduler(builder_args, generator_args, self.pipes, self.loop)
+
+        #TODO: Mode into process and use pipe or queue for comm
+        self.scheduler_thread = threading.Thread(target=self.scheduler.process_requests_loop)
+        self.scheduler_thread.start()
+
+        atexit.register(self.shutdown)
+
+    def shutdown(self):
+        self.scheduler.request_queue.put("stop")
+        self.scheduler_thread.join()
+
+        for p in self.pipes:
+            p.send("stop")
+        for p in self.procs:
+            p.kill()
+
+    def generate(self, text):
+        req = Request.new_request(text)
+        self.scheduler.schedule_request(req)
+
+        generator = self.scheduler.wait_for_request(req)
+
+        running = True
+        while running:
+            output = self.loop.run_until_complete(generator.__anext__())
+            running &= not output.is_finished
+
+            yield output
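
The heart of the placeholder scheduler is a thread-to-asyncio handoff: the scheduler thread appends each Output to a per-request deque and wakes the awaiting coroutine with loop.call_soon_threadsafe(event.set), while wait_for_request drains the deque until an Output is marked finished. A stripped-down, self-contained sketch of that pattern (the strings and the explicit event.clear() are illustrative; this is not the torchchat Scheduler itself):

# Minimal model of the producer/consumer handoff used by Scheduler:
# a worker thread queues results and wakes an asyncio consumer thread-safely.
import asyncio
import threading
from collections import deque

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

results = deque()          # stand-in for req_to_results[request_id]
ready = asyncio.Event()    # stand-in for req_to_states[request_id]


def scheduler_thread() -> None:
    # Stand-in for Scheduler.process_requests_loop / step(): produce a few
    # (text, is_finished) results and signal the event loop each time.
    for step, text in enumerate(["Hello", " world", "!"]):
        results.append((text, step == 2))
        loop.call_soon_threadsafe(ready.set)


async def consume() -> None:
    # Stand-in for Scheduler.wait_for_request: drain results until finished.
    finished = False
    while not finished:
        await ready.wait()
        ready.clear()
        while results:
            text, finished = results.popleft()
            print(text, end="", flush=True)
    print()


t = threading.Thread(target=scheduler_thread)
t.start()
loop.run_until_complete(consume())
t.join()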

torchchat/generate.py

Lines changed: 5 additions & 17 deletions
@@ -1235,30 +1235,18 @@ def main(args):
     else:
         dist_gen = DistributedGenerator(
             builder_args,
-            speculative_builder_args,
             tokenizer_args,
-            # generator_args,
+            generator_args,
             args.profile,
             args.quantize,
             args.draft_quantize,
         )

-        dist_gen.add_request(0, "Tell me a joke")
-        dist_gen.add_request(1, "Tell me another joke")
-        dist_gen.add_request(2, "Who is this Santa")
-        dist_gen.add_request(3, "What did the fish say to the duck")
-
-        responses = {}
+        response = ""
+        for output in dist_gen.generate("Tell me a joke"):
+            response += output.text

-        running = True
-        while running:
-            outputs = dist_gen.step()
-            for o in outputs:
-                responses[o.request_id] = responses.get(o.request_id, "") + o.output
-                running &= not o.is_finished
-
-        print(responses)
-
+        print(f"Model output: {response}")
         dist_gen.shutdown()

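
Because generate yields Output objects as tokens arrive, the same call can stream text to the console instead of accumulating it first. A hedged variant of the loop above, assuming output.text holds the decoded chunk exactly as in the accumulation version:

# Streaming variant of the loop in main(): print each decoded chunk as it
# arrives rather than building one response string.
for output in dist_gen.generate("Tell me a joke"):
    print(output.text, end="", flush=True)
print()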

0 commit comments
