
Commit 8ca76fe

fix vllm configuration and load balancing
1 parent c095ec3 commit 8ca76fe

6 files changed: +53 −47 lines

applications/ColossalChat/coati/distributed/agent/agentic_producer.py

Lines changed: 18 additions & 16 deletions
@@ -52,6 +52,7 @@ def __init__(
         log_rollout_interval: int = 20,
         rollout_log_file: str = "./rollout_log.jsonl",
         enable_profiling: bool = False,
+        load_balancer=None,
         n_behind: int = 0,
     ):
         assert microbatch_size == 1  # microbatch_size must be 1 for agentic producer
@@ -84,6 +85,7 @@ def __init__(
             enable_profiling=enable_profiling,
             n_behind=n_behind,
         )
+        self.load_balancer = load_balancer
         self.tool_workers = tool_workers
         self.agentic_config = model_config if not agentic_config else agentic_config
         self.agentic_config.update({"model": model_config["path"]})
@@ -183,32 +185,26 @@ def _parse_response(self, response: str) -> Dict[str, Any]:
             assistant_message["tool_calls"] = tool_calls
         return assistant_message

-    def _select_tool_worker(self) -> ray.actor.ActorHandle:
+    def _select_tool_worker(self) -> int:
         """
         Select a tool worker based on the current load.
         """
-        loads = ray.get([worker.get_load.remote() for worker in self.tool_workers])
-        min_load = min(loads)
-        candidates = [i for i, l in enumerate(loads) if l == min_load]
-        selected_idx = random.choice(candidates)  # random tie break
-        ray.get(self.tool_workers[selected_idx].increase_load.remote())
-        return self.tool_workers[selected_idx]
+        selected_idx, current_loads = ray.get(self.load_balancer.get_next_worker.remote("tool", amount=1))
+        return selected_idx

-    def _select_async_producer(self, request_id) -> ray.actor.ActorHandle:
+    def _select_async_producer(self, request_id) -> int:
         """
         Select an async producer based on the current load.
         """
         # use the last used async producer if exists to reuse kv cache (as vllm use paged kv cache,
         # it will reuse most of the kv cache pages without recomputation)
         if request_id in self.async_llm_engine_map:
-            return self.async_producers[self.async_llm_engine_map[request_id]]
+            ray.get(self.load_balancer.increase_load.remote("async-llm", self.async_llm_engine_map[request_id], 1))
+            return self.async_llm_engine_map[request_id]
         # otherwise select the least loaded async producer
-        loads = ray.get([proc.get_producer_load.remote() for proc in self.async_producers])
-        min_load = min(loads)
-        candidates = [i for i, l in enumerate(loads) if l == min_load]
-        selected_idx = random.choice(candidates)  # random tie break
+        selected_idx, current_loads = ray.get(self.load_balancer.get_next_worker.remote("async-llm", amount=1))
         self.async_llm_engine_map[request_id] = selected_idx
-        return self.async_producers[selected_idx]
+        return selected_idx

     def _run_agentic_pipeline(self, messages):
         """
@@ -234,7 +230,7 @@ def _run_agentic_pipeline(self, messages):
                 )
                 del self.async_llm_engine_map[request_id]
                 return messages, response_input_ids, logprobs
-            async_producer = self._select_async_producer(request_id=request_id)
+            async_producer = self.async_producers[self._select_async_producer(request_id=request_id)]
             agentic_generate_config = copy.deepcopy(self.generate_config)
             agentic_generate_config["max_tokens"] = self.agentic_config.get("max_tokens", 2048)
             response = ray.get(
@@ -246,6 +242,7 @@ def _run_agentic_pipeline(self, messages):
                 )
             )
             llm_call_count += 1
+            ray.get(self.load_balancer.decrease_load.remote("async-llm", self.async_llm_engine_map[request_id], 1))
             self.consumer_global_step = response.pop("consumer_global_step")
             response_input_ids = response["input_ids"]
             logprobs = response["action_log_probs"]
@@ -261,12 +258,17 @@ def _run_agentic_pipeline(self, messages):
                 return messages, response_input_ids, logprobs
             tool_call_count += len(assistant_message["tool_calls"])
             handlers = []
+            tool_workers_called = []
             for tool_call in assistant_message["tool_calls"]:
                 # select a tool worker to execute the tool call
-                tool_worker = self._select_tool_worker()
+                tool_worker_idx = self._select_tool_worker()
+                tool_workers_called.append(tool_worker_idx)
+                tool_worker = self.tool_workers[tool_worker_idx]
                 handler = tool_worker.call.remote(tool_call["function"]["name"], tool_call["function"]["arguments"])
                 handlers.append(handler)
             tool_results = ray.get(handlers)
+            for idx in tool_workers_called:
+                ray.get(self.load_balancer.decrease_load.remote("tool", idx, 1))
             for tool_call, tool_result in zip(assistant_message["tool_calls"], tool_results):
                 tool_message = {"role": "tool", "content": str(tool_result)}
                 messages.append(tool_message)
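Note on the accounting contract this diff introduces: every selection increments the chosen worker's counter on the shared balancer (get_next_worker does this as part of the pick; reusing a cached engine calls increase_load explicitly), and the counter is decremented once the LLM call or tool call returns. A minimal single-process sketch of that acquire/release cycle, using a plain dict in place of the Ray LoadBalancer actor and a hypothetical run_tool_call stand-in for tool_worker.call.remote:

import random

loads = {"tool": {0: 0, 1: 0, 2: 0}}  # worker index -> number of in-flight calls

def get_next_worker(worker_type: str, amount: int = 1) -> int:
    # least-loaded selection with a random tie break, mirroring LoadBalancer.get_next_worker
    min_load = min(loads[worker_type].values())
    candidates = [i for i, l in loads[worker_type].items() if l == min_load]
    chosen = random.choice(candidates)
    loads[worker_type][chosen] += amount  # the pick also acquires the slot
    return chosen

def run_tool_call(worker_idx: int, name: str, arguments: dict) -> str:
    # hypothetical stand-in for self.tool_workers[worker_idx].call.remote(...)
    return f"{name}({arguments}) executed on tool worker {worker_idx}"

idx = get_next_worker("tool")
try:
    print(run_tool_call(idx, "search", {"query": "load balancing"}))
finally:
    loads["tool"][idx] -= 1  # release the slot when the call finishes, as the producer does via decrease_load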

applications/ColossalChat/coati/distributed/agent/tool_worker.py

Lines changed: 0 additions & 13 deletions
@@ -19,17 +19,6 @@ def __init__(self, tools: List[BaseTool]):
             tools (List[BaseTool]): List of LangChain tools to register.
         """
         self._tool_registry: Dict[str, BaseTool] = {tool.name: tool for tool in tools}
-        self.pending = 0
-
-    @ray.method(concurrency_group="io")
-    def get_load(self) -> int:
-        """Return the current load of the worker."""
-        return self.pending
-
-    @ray.method(concurrency_group="io")
-    def increase_load(self):
-        """Increase the load counter."""
-        self.pending += 1

     @ray.method(concurrency_group="io")
     def list_tools(self) -> List[str]:
@@ -64,7 +53,6 @@ def call(self, tool_name: str, input_data: Union[str, Dict[str, Any]], **kwargs)
             Any: The tool's output.
         """
         if tool_name == "return_parsing_error":
-            self.pending -= 1
             return "Error: Tool call parsing error. Please use the correct JSON format."
         if tool_name not in self._tool_registry:
             return f"Error: Tool {tool_name} not found. Available tools: {self.list_tools()}"
@@ -73,5 +61,4 @@ def call(self, tool_name: str, input_data: Union[str, Dict[str, Any]], **kwargs)
             ret = tool.run(input_data, **kwargs)
         except Exception as e:
             ret = f"Error: Tool {tool_name} execution failed with error: {str(e)}"
-        self.pending -= 1
         return ret
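ToolWorker keeps its @ray.method(concurrency_group="io") annotations; only the per-worker pending counter is removed, since in-flight counting now lives in the central LoadBalancer. For readers unfamiliar with the Ray feature involved, here is a small self-contained actor using a concurrency group; the group name and size are illustrative, not taken from the repo:

import ray

@ray.remote(concurrency_groups={"io": 4})  # allow up to 4 concurrent "io" calls on one actor
class EchoWorker:
    @ray.method(concurrency_group="io")
    def call(self, tool_name: str, arguments: dict) -> str:
        # a real ToolWorker would dispatch to a registered LangChain tool here
        return f"{tool_name} called with {arguments}"

ray.init()
worker = EchoWorker.remote()
print(ray.get(worker.call.remote("search", {"query": "ray concurrency groups"})))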

applications/ColossalChat/coati/distributed/launch.py

Lines changed: 6 additions & 6 deletions
@@ -10,6 +10,7 @@
 from .consumer import SimpleConsumer
 from .grpo_consumer import GRPOConsumer
 from .producer import AsyncSimpleProducer, SimpleProducer
+from .utils import LoadBalancer

 ALGO_MAP = {
     "Simple": SimpleConsumer,
@@ -86,7 +87,7 @@ def launch_distributed(
     num_samples = get_jsonl_size_fast(dataset_path)
     global_inference_batch_size = inference_batch_size * num_producers
     num_update_per_episode = num_samples // global_inference_batch_size
-    num_recv_per_update = inference_batch_size // inference_microbatch_size if "async" not in inference_backend else 1
+    num_recv_per_update = inference_batch_size // inference_microbatch_size if "async-agentic" not in inference_backend else 1

     run_name = f"{inference_backend}_bs_{train_batch_size * train_dp_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}"
     wandb_group_name = str(uuid.uuid4())
@@ -124,6 +125,7 @@
     enable_agentic = "agentic" in inference_backend
     if enable_agentic:
         inference_backend = inference_backend.replace("agentic-", "")
+        inference_microbatch_size = inference_microbatch_size * num_generations
     for i in range(num_producers):
         node_id = gpu_to_node_id[0]
         producer_ip_address = gpu_to_ip_address[0]
@@ -141,11 +143,7 @@
             model_config=inference_model_config,
             generate_config=generate_config,
             tokenizer_config=tokenizer_config,
-            microbatch_size=(
-                inference_microbatch_size * num_generations
-                if "async-agentic" in inference_backend
-                else inference_microbatch_size
-            ),
+            microbatch_size=inference_microbatch_size,
             backend=inference_backend,
             num_generations=num_generations,
             consumer_plugin_config=plugin_config,
@@ -183,6 +181,7 @@
         assert (
             agentic_config["agentic_producer"] in AGENTIC_PRODUCER_MAP
         ), f"Only {list(AGENTIC_PRODUCER_MAP.keys())} are supported as agentic producer so far."
+        load_balancer = LoadBalancer.remote({"tool": len(tool_workers), "async-llm": num_producers})
        agentic_producer_cls = AGENTIC_PRODUCER_MAP[agentic_config["agentic_producer"]]
        agentic_config.pop("agentic_producer")
        producer_procs = [
@@ -214,6 +213,7 @@
                log_rollout_interval=log_rollout_interval,
                rollout_log_file=rollout_log_file,
                enable_profiling=enable_profiling,
+               load_balancer=load_balancer,
                n_behind=n_behind,
            )
            for producer_idx in range(num_producers * inference_batch_size)

applications/ColossalChat/coati/distributed/producer.py

Lines changed: 0 additions & 7 deletions
@@ -636,12 +636,6 @@ async def rollout(self, input_ids, attention_mask, **kwargs):
         """
         raise NotImplementedError("rollout must be implemented in subclasses")

-    async def get_producer_load(self):
-        """
-        Get the load of each producer.
-        """
-        return len(self.model.running_requests)
-
     async def async_sync_model(self, episode, step, num_processes: int = 1) -> None:
         """
         Asyncronous version to sync model from consumer to producer.
@@ -853,7 +847,6 @@ class AsyncSimpleProducer(BaseAsyncProducer):
     Asyncronous version of the producer that uses vLLM for generation.
     This class is designed to handle multiple producer actors and distribute tasks among them.
     """
-
     @torch.no_grad()
     async def rollout(self, input_ids, attention_mask, **kwargs):
         # naive rollout strategy without load balancing

applications/ColossalChat/coati/distributed/utils.py

Lines changed: 25 additions & 3 deletions
@@ -1,12 +1,12 @@
 import json
 import os
 from typing import Any, Dict, List
-
+import asyncio
 import torch
 from filelock import FileLock
-
+import random
 from colossalai.shardformer.layer.loss import dist_log_prob
-
+import ray

 def unbind_batch(batch: Dict[str, torch.Tensor]) -> List[Dict[str, torch.Tensor]]:
     batches = []
@@ -165,3 +165,25 @@ def safe_append_to_jsonl_file(file_path, data):
         for entry in data:
             json_line = json.dumps(entry, ensure_ascii=False)
             f.write(json_line + "\n")
+
+@ray.remote
+class LoadBalancer:
+    def __init__(self, worker_counts):
+        self.load = {}
+        for type in worker_counts:
+            self.load[type] = {k: 0 for k in range(worker_counts[type])}
+
+    def get_next_worker(self, worker_type, amount=1):
+        loads = [(k, v) for k, v in self.load[worker_type].items()]
+        min_load = min(loads, key=lambda x: x[1])
+        candidates = [k for k, v in loads if v == min_load[1]]
+        chosen = random.choice(candidates)
+        self.load[worker_type][chosen] += amount
+        return chosen, self.load[worker_type]
+
+    def increase_load(self, worker_type, worker_id, amount=1):
+        self.load[worker_type][worker_id] += amount
+
+    def decrease_load(self, worker_type, worker_id, amount=1):
+        self.load[worker_type][worker_id] -= amount
+
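The new LoadBalancer is a plain Ray actor keyed by worker type, so all producers share a single view of in-flight work. A small driver sketch of how it can be exercised on its own, assuming the ColossalChat package is importable; the worker counts below are made up:

import ray
from coati.distributed.utils import LoadBalancer  # added in this commit

ray.init()
balancer = LoadBalancer.remote({"tool": 4, "async-llm": 2})

# pick the least-loaded tool worker; ties are broken randomly and the pick bumps its load
idx, loads = ray.get(balancer.get_next_worker.remote("tool", amount=1))
print(f"dispatching to tool worker {idx}, loads now {loads}")

# ... run the tool call on worker `idx` ...

# release the slot once the call has finished
ray.get(balancer.decrease_load.remote("tool", idx, 1))

increase_load exists for the KV-cache reuse path in the agentic producer, where a previously assigned engine is reselected without going through get_next_worker.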

applications/ColossalChat/rl_example.py

Lines changed: 4 additions & 2 deletions
@@ -281,8 +281,10 @@
         # os.environ["VLLM_DP_SIZE"] = str(args.producer_data_parallel_size)
         inference_model_config.update(
             dict(
-                gpu_memory_utilization=0.7,
-                enforce_eager=True,
+                gpu_memory_utilization=0.8,
+                max_num_batched_tokens=4096,
+                max_num_seqs=1024,
+                enforce_eager=False,
                 enable_chunked_prefill=True,
                 max_model_len=args.max_new_tokens + args.max_prompt_tokens,
                 tensor_parallel_size=args.producer_tensor_parallel_size,
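These keys match standard vLLM engine arguments that rl_example.py collects into inference_model_config. A rough standalone equivalent of the updated configuration, with the model path and length limits as placeholder values (the script derives max_model_len from args.max_new_tokens + args.max_prompt_tokens and the TP size from args.producer_tensor_parallel_size):

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model path
    gpu_memory_utilization=0.8,        # was 0.7; reserve more GPU memory for weights and KV cache
    max_num_batched_tokens=4096,       # per-step token budget used by chunked prefill
    max_num_seqs=1024,                 # cap on sequences scheduled concurrently
    enforce_eager=False,               # was True; False lets vLLM capture CUDA graphs for decoding
    enable_chunked_prefill=True,
    max_model_len=4096,                # placeholder for args.max_new_tokens + args.max_prompt_tokens
    tensor_parallel_size=1,            # placeholder for args.producer_tensor_parallel_size
)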
