Skip to content

Commit cb25984

Browse files
authored
Shortfin llm multi workers and multi fibers (#1280)
Enable spinning up the server with multiple workers and multiple fibers per worker to facilitate higher parallelism and concurrency in prefill/decode invocations. ## TODO: - Fix `per_fiber` isolation - I'm also attempting to enable `per_fiber` program isolation. Currently, we've just been using `per_call`. As I understand it, `per_fiber` may end up being faster, but I'm getting a `std::bad_cast` error when attempting to create the input device_arrays in `get_args`. Still looking into this... Otherwise, this is good to use with `per_call` isolation. - Right now we're in a performance sprint and get more benefit by deferring this fix. Created [an issue](#1284) for it, and will raise a `NotImplementedError` in code for this case.
1 parent 104ab43 commit cb25984

File tree

7 files changed

+141
-31
lines changed

7 files changed

+141
-31
lines changed

app_tests/integration_tests/llm/shortfin/direct_to_batcher_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class BatchConsistencyTestProcess(sf.Process):
3838
"""
3939

4040
def __init__(self, service, input_tokens, batch_sizes, max_response_length):
41-
super().__init__(fiber=service.main_fiber)
41+
super().__init__(fiber=service.fiber_pool.fibers[0])
4242
self.service = service
4343
self.input_tokens = input_tokens
4444
self.batch_sizes = batch_sizes

shortfin/python/shortfin_apps/llm/cli.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def add_input_args(parser):
3636

3737

3838
def add_service_args(parser: argparse.ArgumentParser):
39+
# TODO separate the server args from the `offline` args
3940
get_system_args(parser)
4041

4142
parser.add_argument(
@@ -70,7 +71,7 @@ def add_service_args(parser: argparse.ArgumentParser):
7071
metavar="FILE",
7172
)
7273
parser.add_argument(
73-
"--isolation",
74+
"--program_isolation",
7475
type=str,
7576
default="per_call",
7677
choices=[isolation.name.lower() for isolation in ProgramIsolation],
@@ -112,11 +113,23 @@ def add_service_args(parser: argparse.ArgumentParser):
112113
required=False,
113114
help="Temperature value to use for `offline` generation.",
114115
)
116+
parser.add_argument(
117+
"--workers_offline",
118+
type=int,
119+
default=1,
120+
help="Number of workers to use when running in `offline` mode.",
121+
)
115122
parser.add_argument(
116123
"--workers",
117124
type=int,
118125
default=1,
119-
help="Number of concurrent requests that should be running",
126+
help="Number of workers to use when running in `server` mode.",
127+
)
128+
parser.add_argument(
129+
"--fibers_per_worker",
130+
type=int,
131+
default=1,
132+
help="Number of fibers to use per worker.",
120133
)
121134
parser.add_argument(
122135
"--benchmark",
@@ -243,10 +256,10 @@ async def worker(name, queue, fiber):
243256
task.result = responder.response.result()
244257
queue.task_done()
245258

246-
logger.info(msg=f"Setting up {args.workers} workers")
259+
logger.info(msg=f"Setting up {args.workers_offline} workers")
247260
workers = []
248261
queue = asyncio.Queue()
249-
for i in range(args.workers):
262+
for i in range(args.workers_offline):
250263
name = f"worker-{i}"
251264
workerr = service.sysman.ls.create_worker(name)
252265
fiber = service.sysman.ls.create_fiber(workerr)

shortfin/python/shortfin_apps/llm/components/batcher.py

Lines changed: 70 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66

77
import logging
88
import os
9-
from pathlib import Path
9+
10+
from dataclasses import dataclass
11+
from typing import List
1012

1113

1214
import shortfin as sf
@@ -47,6 +49,22 @@ def __init__(self, count: int = 1):
4749
self.count = count
4850

4951

52+
@dataclass
53+
class FiberPool:
54+
55+
fibers: List[sf.Fiber]
56+
idle_fibers: List[sf.Fiber]
57+
58+
def get_fiber(self):
59+
if len(self.idle_fibers) == 0:
60+
return None
61+
62+
return self.idle_fibers.pop(0)
63+
64+
def return_fiber(self, fiber: sf.Fiber):
65+
self.idle_fibers.append(fiber)
66+
67+
5068
class LlmBatcherProcess(BatcherProcess):
5169
"""This batcher provides a high-level mechanism for dispatching LLM tasks."""
5270

@@ -56,13 +74,14 @@ class LlmBatcherProcess(BatcherProcess):
5674
def __init__(
5775
self,
5876
name: str,
59-
fiber: Fiber,
77+
fiber_pool: FiberPool,
6078
page_cache: BasePagedAttentionCache,
6179
model_params: ModelParams,
6280
functions: dict[int, sf.ProgramFunction],
6381
ideal_batch_size: int,
82+
program_isolation: str,
6483
):
65-
super().__init__(fiber=fiber)
84+
super().__init__(fiber=fiber_pool.fibers[0])
6685
self.name = name
6786
self.page_cache = page_cache
6887
self.model_params = model_params
@@ -74,6 +93,9 @@ def __init__(
7493
self.page_seq_stride = self.model_params.paged_kv_cache.block_seq_stride
7594
self._current_workitems = 0
7695

96+
self.fiber_pool = fiber_pool
97+
self.program_isolation = program_isolation
98+
7799
def handle_inference_request(self, request):
78100
"""Handle an inference request."""
79101
self.pending.add(request)
@@ -115,25 +137,32 @@ async def board_flights(self):
115137
logger.info("Waiting a bit longer to fill flight")
116138
return
117139

140+
fiber = self.fiber_pool.get_fiber()
141+
if fiber is None:
142+
logger.info("Waiting for an idle fiber...")
143+
return
144+
118145
self.strobes = 0
119146
cache = self.page_cache
120147

121-
self.board(cache)
148+
self.board(cache, fiber)
122149
logger.debug("Post boarding cache state: %r", cache)
150+
if self.program_isolation != sf.ProgramIsolation.PER_FIBER:
151+
self.fiber_pool.return_fiber(fiber)
123152

124-
def make_process(self, cache: BasePagedAttentionCache):
153+
def make_process(self, cache: BasePagedAttentionCache, fiber: Fiber):
125154
...
126155

127156
def board_request(self, cache, request: LlmInferenceExecRequest):
128157
...
129158

130-
def board(self, cache: BasePagedAttentionCache):
159+
def board(self, cache: BasePagedAttentionCache, fiber: Fiber):
131160
# Fill prefill flights.
132161
pending = self.pending
133162
if len(pending) == 0:
134163
return
135164

136-
exec_process = self.make_process(cache)
165+
exec_process = self.make_process(cache, fiber)
137166

138167
for request in pending:
139168
if len(exec_process.exec_requests) >= self.ideal_batch_size:
@@ -164,26 +193,30 @@ class PrefillBatcherProcess(LlmBatcherProcess):
164193

165194
def __init__(
166195
self,
167-
fiber: Fiber,
196+
fiber_pool: FiberPool,
168197
page_cache: BasePagedAttentionCache,
169198
model_params: ModelParams,
170199
prefill_functions: dict[int, sf.ProgramFunction],
200+
program_isolation: str,
171201
):
172202
super().__init__(
173203
name="prefill",
174-
fiber=fiber,
204+
fiber_pool=fiber_pool,
175205
page_cache=page_cache,
176206
model_params=model_params,
177207
functions=prefill_functions,
178208
ideal_batch_size=max(model_params.prefill_batch_sizes),
209+
program_isolation=program_isolation,
179210
)
180211

181-
def make_process(self, cache: BasePagedAttentionCache):
212+
def make_process(self, cache: BasePagedAttentionCache, fiber: Fiber):
182213
return PrefillExecutorProcess(
183-
self.fiber,
214+
fiber,
184215
self.functions,
185216
self.page_seq_stride,
186217
cache.page_pool.page_tables,
218+
self.fiber_pool,
219+
self.program_isolation,
187220
)
188221

189222
def board_request(self, cache, request: LlmInferenceExecRequest):
@@ -216,26 +249,30 @@ class DecodeBatcherProcess(LlmBatcherProcess):
216249

217250
def __init__(
218251
self,
219-
fiber: Fiber,
252+
fiber_pool: FiberPool,
220253
page_cache: BasePagedAttentionCache,
221254
model_params: ModelParams,
222255
decode_functions: dict[int, sf.ProgramFunction],
256+
program_isolation: str,
223257
):
224258
super().__init__(
225259
name="decode",
226-
fiber=fiber,
260+
fiber_pool=fiber_pool,
227261
page_cache=page_cache,
228262
model_params=model_params,
229263
functions=decode_functions,
230264
ideal_batch_size=max(model_params.decode_batch_sizes),
265+
program_isolation=program_isolation,
231266
)
232267

233-
def make_process(self, cache: BasePagedAttentionCache):
268+
def make_process(self, cache: BasePagedAttentionCache, fiber: Fiber):
234269
return DecodeExecutorProcess(
235-
self.fiber,
270+
fiber,
236271
self.functions,
237272
self.page_seq_stride,
238273
cache.page_pool.page_tables,
274+
self.fiber_pool,
275+
self.program_isolation,
239276
)
240277

241278
def board_request(self, cache, request: LlmInferenceExecRequest):
@@ -260,13 +297,17 @@ def __init__(
260297
functions: dict[int, sf.ProgramFunction],
261298
seq_stride: int,
262299
page_tables,
300+
fiber_pool: FiberPool,
301+
program_isolation: sf.ProgramIsolation,
263302
):
264303
super().__init__(fiber=fiber)
265304
self.name = name
266305
self.seq_stride = seq_stride
267306
self.exec_requests: list[LlmInferenceExecRequest] = []
268307
self.page_tables = page_tables
269308
self.functions = functions
309+
self.fiber_pool = fiber_pool
310+
self.program_isolation = program_isolation
270311

271312
async def get_args(self, bs, device0):
272313
...
@@ -345,13 +386,17 @@ def __init__(
345386
functions: dict[int, sf.ProgramFunction],
346387
seq_stride: int,
347388
page_tables,
389+
fiber_pool: FiberPool,
390+
program_isolation: sf.ProgramIsolation,
348391
):
349392
super().__init__(
350393
name="prefill_process",
351394
fiber=fiber,
352395
functions=functions,
353396
seq_stride=seq_stride,
354397
page_tables=page_tables,
398+
fiber_pool=fiber_pool,
399+
program_isolation=program_isolation,
355400
)
356401

357402
async def get_args(self, bs, device0):
@@ -432,6 +477,9 @@ async def get_results(self, logits, req_count, device0):
432477
req.result_logits = logits_item
433478
req.done.set_success()
434479

480+
if self.program_isolation == sf.ProgramIsolation.PER_FIBER:
481+
self.fiber_pool.return_fiber(self.fiber)
482+
435483

436484
class DecodeExecutorProcess(LlmExecutorProcess):
437485
"""Executes a decode batch."""
@@ -442,13 +490,17 @@ def __init__(
442490
functions: dict[int, sf.ProgramFunction],
443491
seq_stride: int,
444492
page_tables,
493+
fiber_pool: FiberPool,
494+
isolation: sf.ProgramIsolation,
445495
):
446496
super().__init__(
447497
name="decode_process",
448498
fiber=fiber,
449499
functions=functions,
450500
seq_stride=seq_stride,
451501
page_tables=page_tables,
502+
fiber_pool=fiber_pool,
503+
program_isolation=isolation,
452504
)
453505

454506
async def get_args(self, bs, device0):
@@ -545,3 +597,6 @@ async def get_results(self, logits, req_count, device0):
545597
else:
546598
req.result_logits = logits_item
547599
req.done.set_success()
600+
601+
if self.program_isolation == sf.ProgramIsolation.PER_FIBER:
602+
self.fiber_pool.return_fiber(self.fiber)

shortfin/python/shortfin_apps/llm/components/config_struct.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,12 @@ class ServerParams:
212212
# Program isolation configuration
213213
program_isolation: str = "per_call"
214214

215+
# Number of shortfin workers to use during generation
216+
workers: int = 1
217+
218+
# Number of fibers to create per worker
219+
fibers_per_worker: int = 1
220+
215221
decode_config: DecodeConfig | None = None
216222

217223
# Device configuration

shortfin/python/shortfin_apps/llm/components/generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def __init__(
142142
responder: FastAPIResponder,
143143
fiber: sf.Fiber | None = None,
144144
):
145-
super().__init__(fiber=service.main_fiber if fiber is None else fiber)
145+
super().__init__(fiber=service.fiber_pool.fibers[0] if fiber is None else fiber)
146146
self.service = service
147147
self.gen_req = gen_req
148148
self.responder = responder

0 commit comments

Comments (0)