
Commit d3b6b56

[grpo] support vllm_server_base_url for vLLMClient (#4449)
1 parent 23df7f3 commit d3b6b56

File tree

6 files changed: +35 -13 lines changed


docs/source/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions
@@ -209,6 +209,7 @@ A conversation between User and Assistant. The user asks a question, and the Ass
 - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation. Default is False.
 - vllm_mode: vLLM integration mode; options are `server` and `colocate`. Server mode samples from a vLLM server launched with `swift rollout`; colocate mode deploys vLLM inside the training process.
 - vllm_mode server parameters
+- vllm_server_base_url: Base URL of the vLLM server (e.g. http://localhost:8000). Default is None. When set, the host and port settings are ignored.
 - vllm_server_host: Host address of the vLLM server. Default is None. Used when connecting to an external vLLM server.
 - vllm_server_port: Service port of the vLLM server. Default is 8000.
 - vllm_server_timeout: Connection timeout for the vLLM server. Default is 240 seconds.

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 0 deletions
@@ -430,6 +430,7 @@ Reward model parameters are used in PPO and GRPO.
 - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation. Default is False.
 - vllm_mode: vLLM integration mode; options are `server` and `colocate`. Server mode samples from a vLLM server launched with `swift rollout`; colocate mode deploys vLLM inside the training process.
 - vllm_mode server parameters
+- vllm_server_base_url: Base URL of the vLLM server (e.g. http://localhost:8000). Default is None. When set, the host and port settings are ignored.
 - vllm_server_host: Host address of the vLLM server. Default is None. Used when connecting to an external vLLM server.
 - vllm_server_port: Service port of the vLLM server. Default is 8000.
 - vllm_server_timeout: Connection timeout for the vLLM server. Default is 240 seconds.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions
@@ -442,6 +442,7 @@ The meanings of the following parameters can be referenced [here](https://huggin
 - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
 - vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `server` or `colocate`.
 - vllm_mode server parameters
+- vllm_server_base_url: Base URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` and `vllm_server_port` are ignored. Default is None.
 - vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server.
 - vllm_server_port: The service port of the vLLM server. Default is 8000.
 - vllm_server_timeout: The connection timeout for the vLLM server. Default is 240 seconds.

docs/source_en/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions
@@ -219,6 +219,7 @@ Arguments
 - use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
 - vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `server` or `colocate`.
 - vllm_mode server parameters
+- vllm_server_base_url: Base URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` and `vllm_server_port` are ignored. Default is None.
 - vllm_server_host: The host address of the vLLM server. Default is None. This is used when connecting to an external vLLM server.
 - vllm_server_port: The service port of the vLLM server. Default is 8000.
 - vllm_server_timeout: The connection timeout for the vLLM server. Default is 240 seconds.
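
The precedence these parameters describe can be summarized in a short sketch. This is an illustration only; resolve_server_url is a hypothetical helper, not part of this commit:

    from typing import Optional

    def resolve_server_url(base_url: Optional[str], host: Optional[str], port: int = 8000) -> str:
        # Hypothetical helper mirroring the documented precedence:
        # an explicit base URL wins over the host/port pair.
        if base_url is not None:
            return base_url.rstrip('/')
        return f'http://{host or "0.0.0.0"}:{port}'

    assert resolve_server_url('http://localhost:8000', '10.0.0.5', 9000) == 'http://localhost:8000'
    assert resolve_server_url(None, '10.0.0.5', 9000) == 'http://10.0.0.5:9000'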

swift/trainers/arguments.py

Lines changed: 1 addition & 0 deletions
@@ -169,6 +169,7 @@ class GRPOArgumentsMixin:
     vllm_enable_prefix_caching: bool = True
     vllm_tensor_parallel_size: int = 1
     # external vllm (server)
+    vllm_server_base_url: Optional[str] = None
     vllm_server_host: Optional[str] = None
     vllm_server_port: int = 8000
     vllm_server_timeout: float = 240.0
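
A minimal sketch of how these fields sit together, using a stand-in dataclass (ServerConfig is hypothetical; the real fields live on GRPOArgumentsMixin as shown above):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ServerConfig:  # hypothetical stand-in for the GRPOArgumentsMixin fields
        vllm_server_base_url: Optional[str] = None
        vllm_server_host: Optional[str] = None
        vllm_server_port: int = 8000
        vllm_server_timeout: float = 240.0

    cfg = ServerConfig(vllm_server_base_url='http://localhost:8000')
    # With base_url set, the host/port fields are ignored by the client
    # (see the vllm_client.py changes below).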

swift/trainers/rlhf_trainer/vllm_client.py

Lines changed: 30 additions & 13 deletions
@@ -4,8 +4,10 @@

 import atexit
 import logging
+import socket
 import time
 from typing import List, Optional
+from urllib.parse import urlparse

 import requests
 import torch
@@ -36,10 +38,13 @@ class VLLMClient:
     weights in a distributed setting. Before using it, start the vLLM server with `swift rollout`.

     Args:
+        base_url (`str` or `None`, *optional*, defaults to `None`):
+            Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `host` and `server_port` are
+            ignored.
         host (`str`, *optional*, defaults to `"0.0.0.0"`):
-            IP address of the vLLM server.
+            IP address of the vLLM server. Ignored if `base_url` is provided.
         server_port (`int`, *optional*, defaults to `8000`):
-            Port number of the vLLM server.
+            Port number of the vLLM server. Ignored if `base_url` is provided.
         group_port (`int`, *optional*, defaults to `51216`):
             Port number for the weight update group.
         connection_timeout (`float`, *optional*, defaults to `0.0`):
@@ -48,6 +53,7 @@ class VLLMClient:
     """

     def __init__(self,
+                 base_url: Optional[str] = None,
                  host: str = '0.0.0.0',
                  server_port: int = 8000,
                  group_port: int = 51216,
@@ -56,8 +62,17 @@ def __init__(self,
             raise ImportError('vLLM is not installed. Please install it with `pip install vllm`.')

         self.session = requests.Session()
-        self.host = host
-        self.server_port = server_port
+        if base_url is not None:
+            # Parse the base_url to extract host and port
+            parsed_url = urlparse(base_url)
+            self.host = socket.gethostbyname(parsed_url.hostname)
+            scheme = parsed_url.scheme or 'http'
+            self.base_url = f'{scheme}://{parsed_url.netloc}{parsed_url.path}'
+        else:
+            self.host = host
+            self.server_port = server_port
+            self.base_url = f'http://{self.host}:{self.server_port}'
+
         self.group_port = group_port
         self.check_server(connection_timeout)  # check server and fail after timeout

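As a standalone illustration of what the new branch computes, the same urlparse/gethostbyname steps can be run directly (assuming `localhost` resolves to 127.0.0.1 on the local machine):

    import socket
    from urllib.parse import urlparse

    parsed = urlparse('http://localhost:8000')
    print(socket.gethostbyname(parsed.hostname))                        # e.g. 127.0.0.1
    print(f"{parsed.scheme or 'http'}://{parsed.netloc}{parsed.path}")  # http://localhost:8000

Note that when base_url is given, self.server_port is never set; every later request goes through self.base_url instead.
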
@@ -72,7 +87,7 @@ def check_server(self, total_timeout: float = 0.0, retry_interval: float = 2.0):
             total_timeout (`float`, *optional*, defaults to `0.0`):
                 Total timeout duration in seconds.
         """
-        url = f'http://{self.host}:{self.server_port}/health/'
+        url = f'{self.base_url}/health/'
         start_time = time.time()  # Record the start time

         while True:
@@ -83,10 +98,12 @@ def check_server(self, total_timeout: float = 0.0, retry_interval: float = 2.0):
                 elapsed_time = time.time() - start_time
                 if elapsed_time >= total_timeout:
                     raise ConnectionError(
-                        f"The vLLM server can't be reached at {self.host}:{self.server_port} after {total_timeout} "
-                        'seconds. Make sure the server is running by running `swift deploy`.') from exc
+                        f"The vLLM server can't be reached at {self.base_url} after {total_timeout} seconds. Make "
+                        'sure the server is running by running `swift rollout`.') from exc
             else:
                 if response.status_code == 200:
+                    if 'X-Forwarded-For' in response.headers:
+                        self.host = response.headers['X-Forwarded-For']
                     logger.info('Server is up!')
                     return None

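The X-Forwarded-For handling matters when the server sits behind a reverse proxy: the header exposes the backend address, which the client then adopts as self.host for the later weight-update group setup. A minimal sketch of the same lookup, assuming a server is reachable at the URL below:

    import requests

    response = requests.get('http://localhost:8000/health/')  # assumed server location
    backend_host = response.headers.get('X-Forwarded-For')    # None when no proxy is involved
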
@@ -104,7 +121,7 @@ def infer(
         use_tqdm: Optional[bool] = None,
         adapter_request: Optional[AdapterRequest] = None,
     ):
-        url = f'http://{self.host}:{self.server_port}/infer/'
+        url = f'{self.base_url}/infer/'
         response = self.session.post(
             url,
             json={
@@ -126,7 +143,7 @@ def init_communicator(self):
         Initializes the weight update group in a distributed setup for model synchronization.
         """
         # Get the tensor parallel size from the server
-        url = f'http://{self.host}:{self.server_port}/get_world_size/'
+        url = f'{self.base_url}/get_world_size/'
         response = requests.get(url)
         if response.status_code == 200:
             vllm_world_size = response.json()['world_size']
@@ -137,7 +154,7 @@ def init_communicator(self):
         self.rank = vllm_world_size  # the client's rank is the last process

         # Initialize weight update group
-        url = f'http://{self.host}:{self.server_port}/init_communicator/'
+        url = f'{self.base_url}/init_communicator/'
         # On the server side, the host is set to 0.0.0.0
         response = self.session.post(url, json={'host': '0.0.0.0', 'port': self.group_port, 'world_size': world_size})
         if response.status_code != 200:
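
For the group sizing above, the communicator presumably spans every vLLM worker plus this client, with the client taking the last rank; a small worked example under that assumption:

    vllm_world_size = 4               # e.g. reported by /get_world_size/
    world_size = vllm_world_size + 1  # assumption: one extra slot for the client
    client_rank = vllm_world_size     # ranks 0..3 are vLLM workers, 4 is the client
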
@@ -166,7 +183,7 @@ def update_named_param(self, name: str, weights: torch.Tensor):
                 Tensor containing the updated weights.
         """
         dtype, shape = str(weights.dtype), tuple(weights.shape)
-        url = f'http://{self.host}:{self.server_port}/update_named_param/'
+        url = f'{self.base_url}/update_named_param/'
         response = self.session.post(url, json={'name': name, 'dtype': dtype, 'shape': shape})
         if response.status_code != 200:
             raise Exception(f'Request failed: {response.status_code}, {response.text}')
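
Only the tensor's metadata travels over this HTTP call; the weights themselves are synchronized through the weight update group. What the serialized fields look like for a concrete tensor:

    import torch

    weights = torch.zeros(4, 8, dtype=torch.bfloat16)
    dtype, shape = str(weights.dtype), tuple(weights.shape)
    print(dtype, shape)  # torch.bfloat16 (4, 8)
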
@@ -191,7 +208,7 @@ def reset_prefix_cache(self):
         """
         Resets the prefix cache for the model.
         """
-        url = f'http://{self.host}:{self.server_port}/reset_prefix_cache/'
+        url = f'{self.base_url}/reset_prefix_cache/'
         response = self.session.post(url)
         if response.status_code != 200:
             raise Exception(f'Request failed: {response.status_code}, {response.text}')
@@ -200,7 +217,7 @@ def close_communicator(self):
         """
         Closes the weight update group and cleans up the communication group.
         """
-        url = f'http://{self.host}:{self.server_port}/close_communicator/'
+        url = f'{self.base_url}/close_communicator/'

         try:
             response = self.session.post(url)
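
Putting the pieces together, a usage sketch of the new base_url path. It assumes a `swift rollout` vLLM server is reachable at the URL below and that the module imports as its file path suggests; only methods defined in this file are used:

    import torch
    from swift.trainers.rlhf_trainer.vllm_client import VLLMClient

    client = VLLMClient(base_url='http://localhost:8000', connection_timeout=240.0)
    client.init_communicator()                      # join the weight update group
    w = torch.zeros(16, 16)                         # stand-in tensor for illustration
    client.update_named_param('lm_head.weight', w)  # send metadata, then sync weights
    client.reset_prefix_cache()                     # drop cached prefixes after an update
    client.close_communicator()                     # tear down on shutdown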
