
Commit e6d28c8

refactor: dispatcher as forwarding decision maker for llm (#305)
LLM serving is stateful: when a request is served in a distributed manner, it must be routed to the same set of workers for efficient decoding. In particular, when a token output needs to go back to the first layer, it has to return to the worker that holds the request's KV cache; otherwise decoding is incorrect or breaks outright. However, when the serving pipeline is constructed as a mesh, the current implementation does not guarantee correct forwarding. To allow correct forwarding, we make the server (dispatcher) act as the forwarding decision maker for LLMs. Since there is only one dispatcher, the workers of the last stage can deterministically forward a generated token back to the dispatcher, and the dispatcher then determines whether the token needs to be sent back to the first stage or not.
1 parent 68deaea commit e6d28c8
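
As a reading aid, here is a minimal sketch of that forwarding decision. This is not the project's code: names such as Dispatcher, first_stage_workers, and route are illustrative assumptions; the actual logic added by this commit lives in infscale/execution/router.py below.

# Hypothetical sketch of the dispatcher's forwarding decision (not the repo's API).
from dataclasses import dataclass, field


@dataclass
class Dispatcher:
    # seqno -> first-stage worker chosen for this request; decode steps must
    # return to the same worker because it holds the request's KV cache.
    first_stage_workers: dict[int, str] = field(default_factory=dict)

    def route(self, seqno: int, output: dict) -> str:
        """Decide where a last-stage output goes next."""
        if "tokens" in output:
            # decoding finished: nothing to forward; the result goes back
            # to the client from the dispatcher.
            return "client"
        # decoding continues: send the output back to the first stage,
        # always to the replica that already holds the KV cache.
        return self.first_stage_workers[seqno]


# usage: request 7 was originally dispatched to first-stage worker "0-0"
d = Dispatcher(first_stage_workers={7: "0-0"})
print(d.route(7, {"hidden": ...}))   # -> "0-0" (keep decoding)
print(d.route(7, {"tokens": [42]}))  # -> "client" (decoding done)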

File tree

12 files changed (+258, −47 lines)


examples/llama3/auto/linear-no-recover.yaml

Lines changed: 0 additions & 2 deletions
@@ -19,8 +19,6 @@ flow_graph:
   0-0:
     - name: w1
       peers: [s-0]
-    - name: w2
-      peers: [2-0]
   1-0:
     - name: w3
       peers: [0-0]

examples/llama3/auto/linear.yaml

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@ flow_graph:
   0-0:
     - name: w1
       peers: [s-0]
-    - name: w2
-      peers: [2-0]
   1-0:
     - name: w3
       peers: [0-0]

examples/llama3/auto/linear_p3.8xlarge.yaml

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@ flow_graph:
   0-0:
     - name: w1
       peers: [s-0]
-    - name: w2
-      peers: [2-0]
   1-0:
     - name: w3
       peers: [0-0]

examples/llama3/static/linear-no-recover.yaml

Lines changed: 0 additions & 4 deletions
@@ -24,10 +24,6 @@ flow_graph:
       peers: [s-0]
       addr: 10.20.1.50
       backend: gloo
-    - name: w2
-      peers: [2-0]
-      addr: 10.20.1.50
-      backend: gloo
   1-0:
     - name: w3
       peers: [0-0]

examples/llama3/static/linear.yaml

Lines changed: 0 additions & 4 deletions
@@ -23,10 +23,6 @@ flow_graph:
       peers: [s-0]
       addr: 10.20.1.50
       backend: gloo
-    - name: w2
-      peers: [2-0]
-      addr: 10.20.1.50
-      backend: gloo
   1-0:
     - name: w3
       peers: [0-0]

examples/llama3/static/linear_p3.8xlarge.yaml

Lines changed: 4 additions & 8 deletions
@@ -16,26 +16,22 @@ flow_graph:
   s-0:
     - name: w0
       peers: [2-0]
-      addr: 10.20.1.50
+      addr: 10.20.1.72
       backend: nccl
   0-0:
     - name: w1
       peers: [s-0]
-      addr: 10.20.1.50
-      backend: nccl
-    - name: w2
-      peers: [2-0]
-      addr: 10.20.1.50
+      addr: 10.20.1.72
       backend: nccl
   1-0:
     - name: w3
       peers: [0-0]
-      addr: 10.20.1.50
+      addr: 10.20.1.72
       backend: nccl
   2-0:
     - name: w4
       peers: [1-0]
-      addr: 10.20.1.50
+      addr: 10.20.1.72
       backend: nccl
 
 dataset: # huggingface dataset

examples/llama3/static/mesh.yaml

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@ (new file)
---
name: llama3 linear example
model: meta-llama/Meta-Llama-3.1-8B
nfaults: 1
# the following entries are for local development only
# after development is done, they need to be revised accordingly
# to automate building flow_graph, etc.
micro_batch_size: 2
fwd_policy: rr
job_id: "job5"
# maximum number of requests in flight at any given point in time
max_inflight: 4

# Note: IP addresses should be agents'
flow_graph:
  s-0:
    - name: w4
      peers: [2-0]
      addr: 10.20.1.72
      backend: nccl
    - name: w9
      peers: [5-0]
      addr: 10.20.1.72
      backend: nccl
  0-0:
    - name: w0
      peers: [s-0]
      addr: 10.20.1.72
      backend: nccl
  1-0:
    - name: w1
      peers: [0-0]
      addr: 10.20.1.72
      backend: nccl
    - name: w10
      peers: [3-0]
      addr: 10.20.1.72
      backend: nccl
  2-0:
    - name: w2
      peers: [1-0]
      addr: 10.20.1.72
      backend: nccl
    - name: w11
      peers: [4-0]
      addr: 10.20.1.72
      backend: nccl
  3-0:
    - name: w5
      peers: [s-0]
      addr: 10.20.1.50
      backend: nccl
  4-0:
    - name: w6
      peers: [3-0]
      addr: 10.20.1.50
      backend: nccl
    - name: w12
      peers: [0-0]
      addr: 10.20.1.50
      backend: nccl
  5-0:
    - name: w7
      peers: [4-0]
      addr: 10.20.1.50
      backend: nccl
    - name: w13
      peers: [1-0]
      addr: 10.20.1.50
      backend: nccl
dataset: # huggingface dataset
  path: fka/awesome-chatgpt-prompts
  name: ""
  split: train

workers:
  - id: s-0
    device: cuda:0
    is_server: True
    stage:
      start: -1
      end: -1
  - id: 0-0
    device: cuda:1
    stage:
      start: 0
      end: 10
  - id: 1-0
    device: cuda:2
    stage:
      start: 11
      end: 23
  - id: 2-0
    device: cuda:3
    stage:
      start: 24
      end: 34
  - id: 3-0
    device: cuda:1
    stage:
      start: 0
      end: 10
  - id: 4-0
    device: cuda:2
    stage:
      start: 11
      end: 23
  - id: 5-0
    device: cuda:3
    stage:
      start: 24
      end: 34
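
As a reading aid for the topology above, a small sketch (assuming only standard PyYAML; the printed summary format is illustrative, not part of the repo) that lists each worker's layer range and its outgoing connections from flow_graph:

# Illustrative helper: summarize the mesh topology defined in the YAML above.
import yaml

with open("examples/llama3/static/mesh.yaml") as f:
    cfg = yaml.safe_load(f)

# layer range served by each worker (the dispatcher s-0 uses -1/-1)
stages = {w["id"]: (w["stage"]["start"], w["stage"]["end"]) for w in cfg["workers"]}

for worker_id, conns in cfg["flow_graph"].items():
    start, end = stages[worker_id]
    for conn in conns:
        # each entry is a communication "world" pointing at one or more peers
        print(f"{worker_id} (layers {start}..{end}) --{conn['name']}--> {conn['peers']}")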
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@ (new file)
---
name: llama3 linear example
model: meta-llama/Meta-Llama-3.1-8B
nfaults: 1
# the following entries are for local development only
# after development is done, they need to be revised accordingly
# to automate building flow_graph, etc.
micro_batch_size: 2
fwd_policy: rr
job_id: "job5"
# maximum number of requests in flight at any given point in time
max_inflight: 4

# Note: IP addresses should be agents'
flow_graph:
  s-0:
    - name: w4
      peers: [2-0]
      addr: 10.20.1.72
      backend: nccl
    - name: w9
      peers: [5-0]
      addr: 10.20.1.72
      backend: nccl
  0-0:
    - name: w0
      peers: [s-0]
      addr: 10.20.1.72
      backend: nccl
  1-0:
    - name: w1
      peers: [0-0]
      addr: 10.20.1.72
      backend: nccl
  2-0:
    - name: w2
      peers: [1-0]
      addr: 10.20.1.72
      backend: nccl
  3-0:
    - name: w5
      peers: [s-0]
      addr: 10.20.1.50
      backend: nccl
  4-0:
    - name: w6
      peers: [3-0]
      addr: 10.20.1.50
      backend: nccl
  5-0:
    - name: w7
      peers: [4-0]
      addr: 10.20.1.50
      backend: nccl

dataset: # huggingface dataset
  path: fka/awesome-chatgpt-prompts
  name: ""
  split: train

workers:
  - id: s-0
    device: cuda:0
    is_server: True
    stage:
      start: -1
      end: -1
  - id: 0-0
    device: cuda:1
    stage:
      start: 0
      end: 10
  - id: 1-0
    device: cuda:2
    stage:
      start: 11
      end: 23
  - id: 2-0
    device: cuda:3
    stage:
      start: 24
      end: 34
  - id: 3-0
    device: cuda:1
    stage:
      start: 0
      end: 10
  - id: 4-0
    device: cuda:2
    stage:
      start: 11
      end: 23
  - id: 5-0
    device: cuda:3
    stage:
      start: 24
      end: 34

infscale/controller/cfggen.py

Lines changed: 18 additions & 13 deletions
@@ -841,19 +841,24 @@ def _build_flow_graph(self):
                 )
                 world_id += 1
 
-            # For LLM(e.g., llama), add feedback connections from last stage to first stage
-            if self._is_auto_regressive and i == 0 and len(stage_ids) > 1:
-                last_stage_workers = stages[stage_ids[-1]]
-                for last_worker in last_stage_workers:
-                    connections.append(
-                        {
-                            "name": f"w{world_id}",
-                            "peers": FlowList([last_worker["id"]]),
-                            "addr": worker_addr,
-                            "backend": "nccl",
-                        }
-                    )
-                    world_id += 1
+            # WE DON'T NEED A FEEDBACK CONNECTION ANY MORE SINCE THE
+            # DISPATCHER HANDLES THAT.
+            # WE KEEP THE COMMENTED-OUT CODE JUST IN CASE WE HAVE TO
+            # REVERT IT IN THE FUTURE
+            #
+            # # For LLM(e.g., llama), add feedback connections from last stage to first stage
+            # if self._is_auto_regressive and i == 0 and len(stage_ids) > 1:
+            #     last_stage_workers = stages[stage_ids[-1]]
+            #     for last_worker in last_stage_workers:
+            #         connections.append(
+            #             {
+            #                 "name": f"w{world_id}",
+            #                 "peers": FlowList([last_worker["id"]]),
+            #                 "addr": worker_addr,
+            #                 "backend": "nccl",
+            #             }
+            #         )
+            #         world_id += 1
 
         flow_graph[worker_id] = connections
 
infscale/execution/router.py

Lines changed: 22 additions & 2 deletions
@@ -66,6 +66,7 @@ def __init__(self, world_manager: WorldManager, mc: MetricsCollector):
         _ = asyncio.create_task(self._recv_arbiter())
 
         self._fwder: Forwarder = None
+        self._is_server = False
 
     @property
     def rx_q(self) -> asyncio.Queue:
@@ -115,6 +116,7 @@ async def configure(
         worlds_to_remove: list[WorldInfo] = [],
     ) -> None:
         """(Re)configure router."""
+        self._is_server = spec.is_server
         self.device = device
 
         if self._fwder is None:
@@ -314,8 +316,26 @@ async def _recv_arbiter(self) -> None:
         while True:
             try:
                 tensor, seqno = await self.__rx_q.get()
-                # TODO: introduce a prioritization policy
-                await self._rx_q.put((tensor, seqno))
+
+                if (
+                    self._is_server
+                    and self._fwder.is_sticky()
+                    and "tokens" not in tensor
+                ):
+                    # if router is configured for server (i.e., dispatcher),
+                    # we need to check the following:
+                    #
+                    # if the model is llm (i.e., is_sticky()), we need to check
+                    # if decoding is done (if "tokens" are in tensor or not):
+                    # if not, we have to return the tensor to the stage with
+                    # layer 0 to continue decoding
+                    # For that, we put the tensor back to _tx_q so that
+                    # send_arbiter() takes care of sending the tensor to
+                    # a correct stage (i.e., worker) whose start layer is 0.
+                    await self._tx_q.put((seqno, tensor, 0))
+                else:
+                    # TODO: introduce a prioritization policy
+                    await self._rx_q.put((tensor, seqno))
             except Exception as e:
                 # this is very likely to be a no-op due to the simple
                 # get and put operations we do on the asyncio queues.
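
The comment above refers to send_arbiter() picking a worker whose start layer is 0. A rough sketch of what such a selection could look like, under the assumption (not shown in this diff) that the sender keeps per-request stickiness so decode steps return to the replica holding the KV cache; Worker, make_picker, and the round-robin choice are illustrative only, not the repo's send_arbiter():

# Illustrative only: choose a first-stage replica, sticky per request (seqno).
from dataclasses import dataclass
from itertools import cycle


@dataclass
class Worker:
    id: str
    stage_start: int


def make_picker(workers: list[Worker]):
    first_stage = [w for w in workers if w.stage_start == 0]
    rr = cycle(first_stage)          # spread new requests over first-stage replicas
    sticky: dict[int, Worker] = {}   # seqno -> chosen first-stage replica

    def pick(seqno: int) -> Worker:
        # first time we see this request, pick a replica; afterwards,
        # always return the same replica (it holds the KV cache).
        if seqno not in sticky:
            sticky[seqno] = next(rr)
        return sticky[seqno]

    return pick


pick = make_picker([Worker("0-0", 0), Worker("3-0", 0), Worker("1-0", 11)])
print(pick(7).id, pick(8).id, pick(7).id)  # -> 0-0 3-0 0-0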
